
Commit 436ed07

support megatron llama (#3532)

1 parent: 7b309c9

File tree: 14 files changed, +1534 -1429 lines

docs/source/Instruction/Megatron-SWIFT训练.md (+4 -3)

@@ -1,6 +1,8 @@
 
 # Megatron-SWIFT Training
 
+SWIFT incorporates Megatron's parallelization techniques to accelerate the training of large models, including data parallelism, tensor parallelism, pipeline parallelism, sequence parallelism, and context parallelism. For the models that support Megatron training, see the [Supported Models and Datasets documentation](./支持的模型和数据集.md).
+
 ## Environment Setup
 To use Megatron-SWIFT, in addition to installing the swift dependencies, you also need to install the following:
 
@@ -15,7 +17,7 @@ cd apex
 pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
 ```
 
-The dependency library Megatron-LM will be git cloned and installed by swift; no manual installation by the user is required. You can also use the environment variable `MEGATRON_LM_PATH` to point to an already downloaded repo path (for offline environments).
+The dependency library Megatron-LM will be git cloned and installed by swift; no manual installation by the user is required. You can also use the environment variable `MEGATRON_LM_PATH` to point to an already downloaded repo path (for offline environments, use the [core_r0.11.0 branch](https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0)).
 
 
 ## Quick Start Example
@@ -93,7 +95,7 @@ I am a language model developed by swift, you can call me swift-robot. How can I
 ```
 
 - More examples can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron).
-
+- For pretraining, you can use `megatron pt` instead of `megatron sft`, which will use a generative template for training.
 
 ## Command Line Arguments
 
@@ -202,7 +204,6 @@ I am a language model developed by swift, you can call me swift-robot. How can I
 - position_embedding_type: Type of positional embedding; options are 'learned_absolute', 'rope', 'relative', and 'none'. Default is 'rope'.
 - rotary_base: Default is 10000.
 - rotary_percent: Default is 1.
-- rotary_seq_len_interpolation_factor: Sequence length interpolation factor. Default is None.
 - normalization: Options are 'LayerNorm', 'RMSNorm'. Default is RMSNorm.
 - norm_epsilon: Default is 1e-5.
 - swiglu: Use swiglu instead of the default gelu. Default is True.

docs/source/Instruction/支持的模型和数据集.md (+699 -699)

Large diff, not rendered by default.

docs/source_en/Instruction/Megatron-SWIFT-Training.md (+4 -3)

@@ -1,6 +1,8 @@
 
 # Megatron-SWIFT Training
 
+SWIFT incorporates Megatron's parallelization techniques to accelerate the training of large models, including data parallelism, tensor parallelism, pipeline parallelism, sequence parallelism, and context parallelism. For models that support Megatron training, please refer to the [Supported Models and Datasets documentation](./Supported-models-and-datasets.md).
+
 ## Environment Setup
 
 To use Megatron-SWIFT, in addition to installing the `swift` dependencies, you also need to install the following:
@@ -16,7 +18,7 @@ cd apex
 pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
 ```
 
-The dependency library Megatron-LM will be git cloned and installed by swift, no manual installation by the user is required. You can also use the environment variable `MEGATRON_LM_PATH` to point to the already downloaded repo path (for offline environments).
+The dependency library Megatron-LM will be git cloned and installed by swift, no manual installation by the user is required. You can also use the environment variable `MEGATRON_LM_PATH` to point to the already downloaded repo path (for offline environments, use the [core_r0.11.0 branch](https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0)).
 
 
 ## Quick Start Example
@@ -99,7 +101,7 @@ I am a language model developed by swift, you can call me swift-robot. How can I
 ```
 
 - More cases can be viewed [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron).
-
+- For pretraining, you can use `megatron pt` instead of `megatron sft`, which will use a generative template for training.
 
 ## Command Line Arguments
 
@@ -215,7 +217,6 @@ I am a language model developed by swift, you can call me swift-robot. How can I
 - position_embedding_type: Type of positional embedding, options are 'learned_absolute', 'rope', 'relative', and 'none'. Default is 'rope'.
 - rotary_base: Default is 10000.
 - rotary_percent: Default is 1.
-- rotary_seq_len_interpolation_factor: Sequence length interpolation factor, default is None.
 - normalization: Options are 'LayerNorm', 'RMSNorm'. Default is RMSNorm.
 - norm_epsilon: Default is 1e-5.
 - swiglu: Uses swiglu instead of the default gelu. Default is True.
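The offline note above can be exercised without network access. A minimal sketch, assuming a hypothetical local clone path; `--help` is used as a placeholder because the concrete training flags depend on your setup:

```python
import os
import subprocess

# Point MEGATRON_LM_PATH at a pre-downloaded clone of the core_r0.11.0 branch
# so that swift skips its automatic `git clone` (hypothetical path).
os.environ['MEGATRON_LM_PATH'] = '/mnt/repos/Megatron-LM'

# `megatron sft` / `megatron pt` are the CLI entry points referenced above;
# a real run would pass model/dataset arguments instead of --help.
subprocess.run(['megatron', 'sft', '--help'], check=True)
```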

docs/source_en/Instruction/Supported-models-and-datasets.md (+699 -699)

Large diff, not rendered by default.

scripts/utils/run_model_info.py (+34 -4)

@@ -1,6 +1,7 @@
 from typing import Any, List
 
 from swift.llm import MODEL_MAPPING, TEMPLATE_MAPPING, ModelType, TemplateType
+from swift.utils import is_megatron_available
 
 
 def get_url_suffix(model_id):
@@ -9,17 +10,36 @@ def get_url_suffix(model_id):
     return model_id
 
 
+def get_cache_mapping(fpath):
+    with open(fpath, 'r', encoding='utf-8') as f:
+        text = f.read()
+    idx = text.find('| Model ID |')
+    text = text[idx:]
+    text_list = text.split('\n')[2:]
+    cache_mapping = {}
+    for text in text_list:
+        if not text:
+            continue
+        items = text.split('|')
+        if len(items) < 6:
+            break
+        cache_mapping[items[1]] = items[5]
+    return cache_mapping
+
+
 def get_model_info_table():
     fpaths = ['docs/source/Instruction/支持的模型和数据集.md', 'docs/source_en/Instruction/Supported-models-and-datasets.md']
+    cache_mapping = get_cache_mapping(fpaths[0])
     end_words = [['### 多模态大模型', '## 数据集'], ['### Multimodal large models', '## Datasets']]
     result = [
         '| Model ID | Model Type | Default Template | '
-        'Requires | Tags | HF Model ID |\n'
+        'Requires | Support Megatron | Tags | HF Model ID |\n'
         '| -------- | -----------| ---------------- | '
-        '-------- | ---- | ----------- |\n'
+        '-------- | ---------------- | ---- | ----------- |\n'
     ] * 2
     res_llm: List[Any] = []
     res_mllm: List[Any] = []
+    mg_count = 0
     for template in TemplateType.get_template_name_list():
         assert template in TEMPLATE_MAPPING
 
@@ -40,12 +60,22 @@ def get_model_info_table():
             hf_model_id = '-'
         tags = ', '.join(group.tags or model_meta.tags) or '-'
         requires = ', '.join(group.requires or model_meta.requires) or '-'
-        r = (f'|{ms_model_id}|{model_type}|{template}|{requires}|{tags}|{hf_model_id}|\n')
+        if is_megatron_available():
+            from swift.megatron import model
+            support_megatron = getattr(model_meta, 'support_megatron', False)
+            if 'gptq' in ms_model_id.lower() or 'awq' in ms_model_id.lower() or 'int' in ms_model_id.lower():
+                support_megatron = False
+            support_megatron = '&#x2714;' if support_megatron else '&#x2718;'
+        else:
+            support_megatron = cache_mapping.get(ms_model_id, '&#x2718;')
+        if support_megatron == '&#x2714;':
+            mg_count += 1
+        r = (f'|{ms_model_id}|{model_type}|{template}|{requires}|{support_megatron}|{tags}|{hf_model_id}|\n')
         if model_meta.is_multimodal:
            res_mllm.append(r)
         else:
            res_llm.append(r)
-    print(f'LLM总数: {len(res_llm)}, MLLM总数: {len(res_mllm)}')
+    print(f'LLM总数: {len(res_llm)}, MLLM总数: {len(res_mllm)}, Megatron支持模型: {mg_count}')
     text = ['', '']  # llm, mllm
     for i, res in enumerate([res_llm, res_mllm]):
         for r in res:
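A quick sketch of what `get_cache_mapping` extracts when Megatron is not installed locally: it reuses the previously generated docs table, keyed by column 1 (Model ID) with column 5 (the Support Megatron mark). The table fragment below is illustrative, not copied from the real docs:

```python
# Illustrative two-row fragment in the same shape as the generated markdown table.
sample = (
    '| Model ID | Model Type | Default Template | Requires | Support Megatron | Tags | HF Model ID |\n'
    '| -------- | -----------| ---------------- | -------- | ---------------- | ---- | ----------- |\n'
    '|Qwen/Qwen2.5-7B-Instruct|qwen2_5|qwen2_5|-|&#x2714;|-|Qwen/Qwen2.5-7B-Instruct|\n')

cache_mapping = {}
for row in sample.split('\n')[2:]:   # skip header and separator, as get_cache_mapping does
    if not row:
        continue
    items = row.split('|')
    if len(items) < 6:
        break
    cache_mapping[items[1]] = items[5]   # Model ID -> '&#x2714;' or '&#x2718;'

print(cache_mapping)  # {'Qwen/Qwen2.5-7B-Instruct': '&#x2714;'}
```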

swift/megatron/argument/megatron_args.py (-1)

@@ -90,7 +90,6 @@ class MegatronArguments(ExtraMegatronArguments):
     position_embedding_type: Literal['learned_absolute', 'rope', 'relative', 'none'] = 'rope'
     rotary_base: int = 10000
     rotary_percent: float = 1.
-    rotary_seq_len_interpolation_factor: Optional[int] = None
     normalization: Literal['LayerNorm', 'RMSNorm'] = 'RMSNorm'
     norm_epsilon: float = 1e-5
     swiglu: bool = True

swift/megatron/model/config.py (+4 -2)

@@ -16,7 +16,9 @@
     'padded_vocab_size': ['vocab_size'],
     'attention_dropout': ['attention_dropout'],
     'untie_embeddings_and_output_weights': ['tie_word_embeddings'],
-    'swiglu': ['hidden_act']
+    'swiglu': ['hidden_act'],
+    'add_qkv_bias': ['attention_bias'],
+    'disable_bias_linear': ['mlp_bias']
 }
 
 
@@ -28,7 +30,7 @@ def convert_hf_config(config) -> Dict[str, Any]:
             hf_v = getattr(config, hf_k)
             if k == 'rotary_base':
                 megatron_config[k] = int(hf_v)
-            elif k == 'untie_embeddings_and_output_weights':
+            elif k in {'untie_embeddings_and_output_weights', 'disable_bias_linear'}:
                 megatron_config[k] = not hf_v
             elif k == 'swiglu':
                 if hf_v == 'silu':
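The two new mapping entries follow the existing pattern: `add_qkv_bias` appears to be copied from the HF config directly, while `disable_bias_linear`, like `untie_embeddings_and_output_weights`, is the logical inverse of its HF counterpart. A minimal sketch with illustrative llama-style values (the config object is a stand-in, not a real HF config):

```python
from types import SimpleNamespace

# Stand-in for a HF llama config; only the fields used by the new entries are present.
hf_config = SimpleNamespace(tie_word_embeddings=False, attention_bias=False, mlp_bias=False)

megatron_config = {
    'untie_embeddings_and_output_weights': not hf_config.tie_word_embeddings,  # inverted
    'add_qkv_bias': hf_config.attention_bias,                                  # copied as-is
    'disable_bias_linear': not hf_config.mlp_bias,                             # inverted
}
print(megatron_config)
# {'untie_embeddings_and_output_weights': True, 'add_qkv_bias': False, 'disable_bias_linear': True}
```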

swift/megatron/model/gpt/__init__.py (+18 -1)

@@ -9,5 +9,22 @@
 
 register_megatron_model(
     MegatronModelMeta(MegatronModelType.gpt, [
-        ModelType.qwen, ModelType.qwen2, ModelType.qwen2_5, ModelType.qwq, ModelType.qwq_preview, ModelType.qwen2_5_math
+        ModelType.qwen,
+        ModelType.qwen2,
+        ModelType.qwen2_5,
+        ModelType.qwq,
+        ModelType.qwq_preview,
+        ModelType.qwen2_5_math,
+        ModelType.llama3,
+        ModelType.llama,
+        ModelType.marco_o1,
+        ModelType.deepseek_r1_distill,
+        ModelType.yi,
+        ModelType.yi_coder,
+        ModelType.sus,
+        ModelType.skywork_o1,
+        ModelType.openbuddy_llama,
+        ModelType.megrez,
+        ModelType.numina,
+        ModelType.mengzi3,
     ], model_provider, convert_hf_config, convert_mcore2hf, convert_hf2mcore))
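With the llama-family entries registered, a GPT-architecture Megatron meta should now be found for these checkpoints. A rough sketch (the import path is an assumption; `get_megatron_model_meta` is the lookup used in swift/megatron/utils/convert.py):

```python
from swift.megatron.model import get_megatron_model_meta  # import path assumed

meta = get_megatron_model_meta('LLM-Research/Meta-Llama-3-8B-Instruct')
print(meta is not None)                        # expected: True once llama3 is in the list above
print(meta.megatron_model_type if meta else None)
```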

swift/megatron/model/gpt/hf2mcore.py (+8 -7)

@@ -20,13 +20,14 @@ def set_attn_state(args, mg_layer, hf_layer):
     mg_attn.linear_proj.weight.data.copy_(hf_attn.o_proj.weight)
 
     # Copy bias
-    mg_attn.linear_qkv.bias.data.copy_(
-        torch.cat([
-            hf_attn.q_proj.bias.reshape((num_query_groups, -1)),
-            hf_attn.k_proj.bias.reshape((num_query_groups, -1)),
-            hf_attn.v_proj.bias.reshape((num_query_groups, -1)),
-        ],
-                  dim=1).reshape(-1))
+    if args.add_qkv_bias:
+        mg_attn.linear_qkv.bias.data.copy_(
+            torch.cat([
+                hf_attn.q_proj.bias.reshape((num_query_groups, -1)),
+                hf_attn.k_proj.bias.reshape((num_query_groups, -1)),
+                hf_attn.v_proj.bias.reshape((num_query_groups, -1)),
+            ],
+                      dim=1).reshape(-1))
 
 
 def set_mlp_state(args, mg_layer, hf_layer):
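The gating on `args.add_qkv_bias` matters for llama checkpoints, which typically have no attention bias. When the bias does exist, the packing concatenates each KV group's query, key, and value slices before flattening, which matches Megatron's fused `linear_qkv` layout. A toy sketch of that packing (shapes are illustrative):

```python
import torch

# Illustrative GQA shapes: 4 query heads, 2 KV groups, head_dim 3.
num_heads, num_query_groups, head_dim = 4, 2, 3
q = torch.arange(num_heads * head_dim, dtype=torch.float32)                    # q_proj.bias
k = torch.arange(100, 100 + num_query_groups * head_dim, dtype=torch.float32)  # k_proj.bias
v = torch.arange(200, 200 + num_query_groups * head_dim, dtype=torch.float32)  # v_proj.bias

# Same packing as set_attn_state: per-group concat, then flatten, giving
# [q of group 0, k of group 0, v of group 0, q of group 1, ...].
qkv_bias = torch.cat([
    q.reshape(num_query_groups, -1),
    k.reshape(num_query_groups, -1),
    v.reshape(num_query_groups, -1),
], dim=1).reshape(-1)
print(qkv_bias.shape)  # torch.Size([24]) == (num_heads + 2 * num_query_groups) * head_dim
```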

swift/megatron/model/gpt/mcore2hf.py (+5 -4)

@@ -17,10 +17,11 @@ def set_attn_state(args, mg_layer, hf_layer):
     hf_attn.o_proj.weight.data.copy_(mg_attn.linear_proj.weight)
 
     # Copy bias
-    mg_attn_bias = mg_attn.linear_qkv.bias.reshape((num_query_groups, -1))
-    hf_attn.q_proj.bias.data.copy_(mg_attn_bias[:, :q_dim].reshape(-1))
-    hf_attn.k_proj.bias.data.copy_(mg_attn_bias[:, q_dim:-kv_dim].reshape(-1))
-    hf_attn.v_proj.bias.data.copy_(mg_attn_bias[:, -kv_dim:].reshape(-1))
+    if args.add_qkv_bias:
+        mg_attn_bias = mg_attn.linear_qkv.bias.reshape((num_query_groups, -1))
+        hf_attn.q_proj.bias.data.copy_(mg_attn_bias[:, :q_dim].reshape(-1))
+        hf_attn.k_proj.bias.data.copy_(mg_attn_bias[:, q_dim:-kv_dim].reshape(-1))
+        hf_attn.v_proj.bias.data.copy_(mg_attn_bias[:, -kv_dim:].reshape(-1))
 
 
 def set_mlp_state(args, mg_layer, hf_layer):
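The reverse direction slices the fused bias back apart. `q_dim` and `kv_dim` are defined elsewhere in `set_attn_state`, so the values below are reconstructed from the slicing pattern; a small round-trip check under those assumptions:

```python
import torch

num_heads, num_query_groups, head_dim = 4, 2, 3
q_dim = num_heads // num_query_groups * head_dim   # per-group query width (assumed)
kv_dim = head_dim                                   # per-group key/value width (assumed)

q = torch.arange(num_heads * head_dim, dtype=torch.float32)
k = torch.arange(100, 100 + num_query_groups * head_dim, dtype=torch.float32)
v = torch.arange(200, 200 + num_query_groups * head_dim, dtype=torch.float32)

# Pack as in hf2mcore, slice as in mcore2hf, and confirm the originals come back.
mg_attn_bias = torch.cat([q.reshape(num_query_groups, -1),
                          k.reshape(num_query_groups, -1),
                          v.reshape(num_query_groups, -1)], dim=1)
assert torch.equal(mg_attn_bias[:, :q_dim].reshape(-1), q)
assert torch.equal(mg_attn_bias[:, q_dim:-kv_dim].reshape(-1), k)
assert torch.equal(mg_attn_bias[:, -kv_dim:].reshape(-1), v)
```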

swift/megatron/model/gpt/model.py (+3 -1)

@@ -24,5 +24,7 @@ def model_provider(pre_process=True, post_process=True):
         position_embedding_type=args.position_embedding_type,
         rotary_percent=args.rotary_percent,
         rotary_base=args.rotary_base,
-        rope_scaling=args.use_rope_scaling)
+        rope_scaling=args.use_rope_scaling,
+        rope_scaling_factor=args.rope_scaling_factor,
+        seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor)
     return model

swift/megatron/model/register.py (+7 -5)

@@ -24,14 +24,16 @@ class MegatronModelMeta:
     model_groups: List[ModelGroup] = field(default_factory=list)
 
 
-def register_megatron_model(model_meta: MegatronModelMeta, *, exist_ok: bool = False):
-    megatron_model_type = model_meta.megatron_model_type
-    for model_type in model_meta.model_types:
-        model_meta.model_groups += MODEL_MAPPING[model_type].model_groups
+def register_megatron_model(megatron_model_meta: MegatronModelMeta, *, exist_ok: bool = False):
+    megatron_model_type = megatron_model_meta.megatron_model_type
+    for model_type in megatron_model_meta.model_types:
+        model_meta = MODEL_MAPPING[model_type]
+        model_meta.support_megatron = True
+        megatron_model_meta.model_groups += model_meta.model_groups
     if not exist_ok and megatron_model_type in MEGATRON_MODEL_MAPPING:
         raise ValueError(f'The `{megatron_model_type}` has already been registered in the MODEL_MAPPING.')
 
-    MEGATRON_MODEL_MAPPING[megatron_model_type] = model_meta
+    MEGATRON_MODEL_MAPPING[megatron_model_type] = megatron_model_meta
 
 
 def get_megatron_model_meta(model_id_or_path: str) -> Optional[MegatronModelMeta]:
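Registration now also stamps `support_megatron = True` onto the underlying HF model metas, which is what run_model_info.py reads when building the table. A rough sketch of that side effect, assuming (as run_model_info.py does) that importing `swift.megatron.model` triggers the registrations and that `ModelType.llama3` is the string `'llama3'`:

```python
from swift.llm import MODEL_MAPPING
from swift.megatron import model  # noqa: F401  # importing runs register_megatron_model

print(getattr(MODEL_MAPPING['llama3'], 'support_megatron', False))  # expected: True
```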

swift/megatron/utils/convert.py (+2)

@@ -62,6 +62,7 @@ def convert_hf2mcore(args: ExportArguments) -> None:
     kwargs = args.get_model_kwargs()
     hf_model, processor = get_model_tokenizer(**kwargs)
     megatron_model_meta = get_megatron_model_meta(args.model)
+    assert megatron_model_meta is not None, f'Model: {args.model} is not supported.'
     kwargs = megatron_model_meta.convert_hf_config(processor.model_info.config)
     megatron_args = MegatronArguments(**kwargs, **convert_kwargs, save=args.output_dir)
     patch_megatron_tokenizer(processor)
@@ -83,6 +84,7 @@ def convert_mcore2hf(args: ExportArguments) -> None:
     kwargs = args.get_model_kwargs()
     hf_model, processor = get_model_tokenizer(**kwargs)
     megatron_model_meta = get_megatron_model_meta(args.model)
+    assert megatron_model_meta is not None, f'Model: {args.model} is not supported.'
     kwargs = megatron_model_meta.convert_hf_config(processor.model_info.config)
     megatron_args = MegatronArguments(**kwargs, **convert_kwargs, load=args.mcore_model)
     patch_megatron_tokenizer(processor)
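The new asserts replace an opaque AttributeError with a clear message when a model has no Megatron meta. The supported path is exactly what the new alignment test drives; a usage sketch mirroring that test (the flag behavior noted in the comments is inferred from the flag names):

```python
from swift.llm import export_main, ExportArguments

export_main(ExportArguments(
    model='LLM-Research/Meta-Llama-3-8B-Instruct',  # any model type registered in gpt/__init__.py
    to_mcore=True,                                  # convert the HF checkpoint to Megatron-Core format
    exist_ok=True,
    test_convert_precision=True))                   # also check the converted weights against the HF model
```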

tests/megatron/test_align/test_llm.py (+47, new file)

@@ -0,0 +1,47 @@
+import os
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def _test_model(model_id):
+    from swift.llm import export_main, ExportArguments
+    export_main(ExportArguments(model=model_id, to_mcore=True, exist_ok=True, test_convert_precision=True))
+
+
+def test_llama2():
+    _test_model('modelscope/Llama-2-7b-chat-ms')
+
+
+def test_llama3():
+    _test_model('LLM-Research/Meta-Llama-3-8B-Instruct')
+
+
+def test_marco_o1():
+    _test_model('AIDC-AI/Marco-o1')
+
+
+def test_deepseek_r1_llama():
+    # TODO: FIX rope
+    _test_model('deepseek-ai/DeepSeek-R1-Distill-Llama-8B')
+
+
+def test_deepseek_r1_qwen():
+    _test_model('deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B')
+
+
+def test_yi():
+    _test_model('01ai/Yi-1.5-6B-Chat')
+
+
+def test_megrez():
+    _test_model('InfiniAI/Megrez-3b-Instruct')
+
+
+if __name__ == '__main__':
+    # test_llama2()
+    # test_llama3()
+    # test_marco_o1()
+    # test_deepseek_r1_llama()
+    # test_deepseek_r1_qwen()
+    # test_yi()
+    test_megrez()
