Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add Tecorigin SDAA accelerator support #3330

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/usage_guides/big_modeling.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ with init_empty_weights():

Next, the weights are loaded into the model for inference.

The [`load_checkpoint_and_dispatch`] method loads a checkpoint inside your empty model and dispatches the weights for each layer across all available devices, starting with the fastest devices (GPU, MPS, XPU, NPU, MLU, MUSA) first before moving to the slower ones (CPU and hard drive).
The [`load_checkpoint_and_dispatch`] method loads a checkpoint inside your empty model and dispatches the weights for each layer across all available devices, starting with the fastest devices (GPU, MPS, XPU, NPU, MLU, SDAA, MUSA) first before moving to the slower ones (CPU and hard drive).

Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.

Expand Down
5 changes: 4 additions & 1 deletion src/accelerate/accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ def __init__(
and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
):
self.native_amp = True
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa") or is_torch_xla_available(
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa", "sdaa") or is_torch_xla_available(
check_is_tpu=True
):
raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
Expand Down Expand Up @@ -1151,6 +1151,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_XPU,
):
Expand Down Expand Up @@ -1454,6 +1455,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
if self.distributed_type in (
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
Expand Down Expand Up @@ -3228,6 +3230,7 @@ def _inner(folder):
if self.num_processes > 1 and self.distributed_type in (
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
):
Expand Down
5 changes: 5 additions & 0 deletions src/accelerate/big_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
infer_auto_device_map,
is_bnb_available,
is_mlu_available,
is_sdaa_available,
is_musa_available,
is_npu_available,
is_torch_version,
Expand Down Expand Up @@ -468,6 +469,8 @@ def wrapper(*args, **kwargs):
model.npu = add_warning(model.npu, model)
elif is_mlu_available():
model.mlu = add_warning(model.mlu, model)
elif is_sdaa_available():
model.sdaa = add_warning(model.sdaa, model)
elif is_musa_available():
model.musa = add_warning(model.musa, model)
elif is_xpu_available():
Expand All @@ -490,6 +493,8 @@ def wrapper(*args, **kwargs):
device = f"npu:{device}"
elif is_mlu_available() and isinstance(device, int):
device = f"mlu:{device}"
elif is_sdaa_available() and isinstance(device, int):
device = f"sdaa:{device}"
elif is_musa_available() and isinstance(device, int):
device = f"musa:{device}"
elif is_xpu_available() and isinstance(device, int):
Expand Down
5 changes: 5 additions & 0 deletions src/accelerate/checkpointing.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
WEIGHTS_NAME,
get_pretty_name,
is_mlu_available,
is_sdaa_available,
is_torch_xla_available,
is_xpu_available,
load,
Expand Down Expand Up @@ -152,6 +153,8 @@ def save_accelerator_state(
states["torch_xpu_manual_seed"] = torch.xpu.get_rng_state_all()
if is_mlu_available():
states["torch_mlu_manual_seed"] = torch.mlu.get_rng_state_all()
elif is_sdaa_available():
states["torch_sdaa_manual_seed"] = torch.sdaa.get_rng_state_all()
else:
states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
if is_torch_xla_available():
Expand Down Expand Up @@ -275,6 +278,8 @@ def load_accelerator_state(
torch.xpu.set_rng_state_all(states["torch_xpu_manual_seed"])
if is_mlu_available():
torch.mlu.set_rng_state_all(states["torch_mlu_manual_seed"])
elif is_sdaa_available():
torch.sdaa.set_rng_state_all(states["torch_sdaa_manual_seed"])
else:
torch.cuda.set_rng_state_all(states["torch_cuda_manual_seed"])
if is_torch_xla_available():
Expand Down
10 changes: 10 additions & 0 deletions src/accelerate/commands/config/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
is_deepspeed_available,
is_fp8_available,
is_mlu_available,
is_sdaa_available,
is_mps_available,
is_msamp_available,
is_musa_available,
Expand Down Expand Up @@ -61,6 +62,7 @@ def get_cluster_input():
"multi-GPU",
"multi-NPU",
"multi-MLU",
"multi-SDAA",
"multi-MUSA",
"TPU",
],
Expand All @@ -80,6 +82,7 @@ def get_cluster_input():
if distributed_type in [
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
Expand Down Expand Up @@ -164,6 +167,7 @@ def get_cluster_input():
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.XLA,
DistributedType.MULTI_MUSA,
]
Expand Down Expand Up @@ -226,6 +230,7 @@ def get_cluster_input():
DistributedType.MULTI_XPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.NO,
]
Expand Down Expand Up @@ -380,6 +385,7 @@ def get_cluster_input():
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_XPU,
]:
Expand Down Expand Up @@ -552,6 +558,7 @@ def get_cluster_input():
DistributedType.MULTI_XPU,
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.XLA,
Expand Down Expand Up @@ -589,6 +596,7 @@ def get_cluster_input():
in [
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
Expand All @@ -601,6 +609,8 @@ def get_cluster_input():
machine_type = "NPU(s)"
elif is_mlu_available():
machine_type = "MLU(s)"
elif is_sdaa_available():
machine_type = "SDAA(s)"
elif is_musa_available():
machine_type = "MUSA(s)"
elif is_xpu_available():
Expand Down
2 changes: 1 addition & 1 deletion src/accelerate/commands/config/config_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def _convert_compute_environment(value):
def _convert_distributed_mode(value):
value = int(value)
return DistributedType(
["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "MULTI_MUSA", "XLA"][value]
["NO", "MULTI_CPU", "MULTI_XPU", "MULTI_GPU", "MULTI_NPU", "MULTI_MLU", "MULTI_SDAA", "MULTI_MUSA", "XLA"][value]
)


Expand Down
10 changes: 9 additions & 1 deletion src/accelerate/commands/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import torch

from ...utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
from ...utils import is_mlu_available, is_sdaa_available, is_musa_available, is_npu_available, is_xpu_available
from .config_args import ClusterConfig, default_json_config_file
from .config_utils import SubcommandHelpFormatter

Expand Down Expand Up @@ -65,6 +65,14 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
config["distributed_type"] = "MULTI_MLU"
else:
config["distributed_type"] = "NO"
if is_sdaa_available():
num_sdaas = torch.sdaa.device_count()
config["num_processes"] = num_sdaas
config["use_cpu"] = False
if num_sdaas > 1:
config["distributed_type"] = "MULTI_SDAA"
else:
config["distributed_type"] = "NO"
elif is_musa_available():
num_musas = torch.musa.device_count()
config["num_processes"] = num_musas
Expand Down
6 changes: 5 additions & 1 deletion src/accelerate/commands/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from accelerate import __version__ as version
from accelerate.commands.config import default_config_file, load_config_from_file

from ..utils import is_mlu_available, is_musa_available, is_npu_available, is_xpu_available
from ..utils import is_mlu_available, is_sdaa_available, is_musa_available, is_npu_available, is_xpu_available


def env_command_parser(subparsers=None):
Expand All @@ -49,6 +49,7 @@ def env_command(args):
pt_cuda_available = torch.cuda.is_available()
pt_xpu_available = is_xpu_available()
pt_mlu_available = is_mlu_available()
pt_sdaa_available = is_sdaa_available()
pt_musa_available = is_musa_available()
pt_npu_available = is_npu_available()

Expand Down Expand Up @@ -76,13 +77,16 @@ def env_command(args):
"PyTorch XPU available": str(pt_xpu_available),
"PyTorch NPU available": str(pt_npu_available),
"PyTorch MLU available": str(pt_mlu_available),
"PyTorch SDAA available": str(pt_sdaa_available),
"PyTorch MUSA available": str(pt_musa_available),
"System RAM": f"{psutil.virtual_memory().total / 1024 ** 3:.2f} GB",
}
if pt_cuda_available:
info["GPU type"] = torch.cuda.get_device_name()
if pt_mlu_available:
info["MLU type"] = torch.mlu.get_device_name()
if pt_sdaa_available:
info["SDAA type"] = torch.sdaa.get_device_name()
if pt_npu_available:
info["CANN version"] = torch.version.cann

Expand Down
5 changes: 5 additions & 0 deletions src/accelerate/commands/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
is_bf16_available,
is_deepspeed_available,
is_mlu_available,
is_sdaa_available,
is_musa_available,
is_npu_available,
is_rich_available,
Expand Down Expand Up @@ -994,6 +995,7 @@ def _validate_launch_command(args):
DistributedType.MULTI_GPU,
DistributedType.MULTI_NPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_XPU,
)
Expand Down Expand Up @@ -1076,6 +1078,8 @@ def _validate_launch_command(args):
args.num_processes = torch.xpu.device_count()
elif is_mlu_available():
args.num_processes = torch.mlu.device_count()
elif is_sdaa_available():
args.num_processes = torch.sdaa.device_count()
elif is_musa_available():
args.num_processes = torch.musa.device_count()
elif is_npu_available():
Expand All @@ -1091,6 +1095,7 @@ def _validate_launch_command(args):
and (
(args.use_xpu and is_xpu_available() and torch.xpu.device_count() > 1)
or (is_mlu_available() and torch.mlu.device_count() > 1)
or (is_sdaa_available() and torch.sdaa.device_count() > 1)
or (is_musa_available() and torch.musa.device_count() > 1)
or (is_npu_available() and torch.npu.device_count() > 1)
or (torch.cuda.device_count() > 1)
Expand Down
2 changes: 1 addition & 1 deletion src/accelerate/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from .utils.other import recursive_getattr


_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "musa"]
_accelerate_added_attributes = ["to", "cuda", "npu", "xpu", "mlu", "sdaa", "musa"]


class ModelHook:
Expand Down
1 change: 1 addition & 0 deletions src/accelerate/local_sgd.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def __init__(self, accelerator: Accelerator, model: torch.nn.Module, local_sgd_s
DistributedType.MULTI_GPU,
DistributedType.MULTI_XPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
]:
Expand Down
20 changes: 19 additions & 1 deletion src/accelerate/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
is_fp8_available,
is_ipex_available,
is_mlu_available,
is_sdaa_available,
is_mps_available,
is_musa_available,
is_npu_available,
Expand All @@ -58,6 +59,9 @@
if is_mlu_available(check_device=False):
import torch_mlu # noqa: F401

if is_sdaa_available(check_device=False):
import torch_sdaa # noqa: F401

if is_musa_available(check_device=False):
import torch_musa # noqa: F401

Expand Down Expand Up @@ -201,6 +205,9 @@ def __init__(self, cpu: bool = False, **kwargs):
from deepspeed import comm as dist

if not dist.is_initialized():
if self.backend == 'tccl':
local_rank = os.environ.get("LOCAL_RANK", -1)
torch.sdaa.set_device(f'sdaa:{local_rank}')
dist.init_distributed(dist_backend=self.backend, auto_mpi_discovery=False, **kwargs)
# We need to flag to `use_deepspeed` to be True to override `distributed_type` later
use_deepspeed = True
Expand All @@ -209,6 +216,9 @@ def __init__(self, cpu: bool = False, **kwargs):
self.distributed_type not in (DistributedType.MULTI_XPU, DistributedType.MULTI_CPU)
and not torch.distributed.is_initialized()
):
if self.backend == 'tccl':
local_rank = os.environ.get("LOCAL_RANK", -1)
torch.sdaa.set_device(f'sdaa:{local_rank}')
torch.distributed.init_process_group(backend=self.backend, **kwargs)
# XPU and CPU require special env configs to be set
if self.distributed_type in (DistributedType.MULTI_XPU, DistributedType.MULTI_CPU):
Expand Down Expand Up @@ -365,6 +375,7 @@ def wait_for_everyone(self):
if self.distributed_type in (
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
Expand Down Expand Up @@ -685,6 +696,7 @@ def default_device(self) -> torch.device:
- MPS if `torch.backends.mps.is_available()` and `torch.backends.mps.is_built()` both return True.
- CUDA if `torch.cuda.is_available()`
- MLU if `is_mlu_available()`
- SDAA if `is_sdaa_available()`
- MUSA if `is_musa_available()`
- NPU if `is_npu_available()`
- CPU otherwise
Expand All @@ -694,6 +706,8 @@ def default_device(self) -> torch.device:
return torch.device("mps")
elif is_mlu_available():
return torch.device("mlu")
elif is_sdaa_available():
return torch.device("sdaa")
elif is_musa_available():
return torch.device("musa")
# NPU should be checked before CUDA when using `transfer_to_npu`
Expand Down Expand Up @@ -724,6 +738,9 @@ def _prepare_backend(
if is_mlu_available():
backend = "cncl"
distributed_type = DistributedType.MULTI_MLU
if is_sdaa_available():
backend = "tccl"
distributed_type = DistributedType.MULTI_SDAA
elif is_musa_available():
backend = "mccl"
distributed_type = DistributedType.MULTI_MUSA
Expand Down Expand Up @@ -776,7 +793,7 @@ def set_device(self):
self.device = torch.device("cpu") if self._cpu else self.default_device
return
device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla"):
if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla", "sdaa"):
raise ValueError(
f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
)
Expand Down Expand Up @@ -906,6 +923,7 @@ def __init__(
elif self.distributed_type in [
DistributedType.MULTI_GPU,
DistributedType.MULTI_MLU,
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
Expand Down
1 change: 1 addition & 0 deletions src/accelerate/test_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
require_cuda,
require_huggingface_suite,
require_mlu,
require_sdaa,
require_mps,
require_multi_device,
require_multi_gpu,
Expand Down
Loading