Commit

update fpt scripts
Spico197 committed Jul 27, 2023
1 parent fd65831 commit deacfcb
Showing 11 changed files with 631 additions and 84 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -4,7 +4,7 @@

## 🌴 Dependencies

- Python >= 3.10
- Python >= 3.11
- scikit-learn>=1.3.0
- omegaconf>=2.0.6
- tqdm>=4.65.0
@@ -33,6 +33,12 @@

```bash
$ git clone git@github.com:pjlab-sys4nlp/train-moe.git
$ cd train-moe
$ pip install -e .[dev]
$ pre-commit install
```

## 🔗 Experiments

- CPT
  - [MoEfication L2-norm 8-choose-4 continued pre-training experiment](https://m04hsypyylv.feishu.cn/docx/R9Tid61U0oOuQ4xwrbGcyCyvnMf)
7 changes: 4 additions & 3 deletions scripts/cpt/fpt.sh
@@ -1,6 +1,6 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-moe-fpt-bs8
#SBATCH --job-name=cpt-moe-fpt-bs1-debug
#SBATCH --partition=MoE
#SBATCH --output=logs/%x-%j.log
#SBATCH --error=logs/%x-%j.log
@@ -18,6 +18,8 @@ num_gpu_per_node=8 # should match with --gres

# #cpu/#num_gpu_per_node
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO
export LOGLEVEL=INFO

lr=1e-4

@@ -29,7 +31,7 @@ pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed

per_device_train_batch_size=8
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=1
block_size=2048
@@ -53,7 +55,6 @@ head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Node: $head_node"
echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO

srun torchrun \
--nnodes ${num_nodes} \
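For context on the batch-size change above (per_device_train_batch_size drops from 8 to 1 for the debug run), the effective global batch size in a plain data-parallel torchrun launch is the product of the per-device batch size, the gradient accumulation steps, and the total number of GPUs. A minimal sketch of that arithmetic (the helper name and the single-node assumption are illustrative, not part of this commit):

```python
def effective_batch(per_device_bs: int, grad_accum: int, gpus_per_node: int,
                    num_nodes: int, block_size: int = 2048):
    """Hypothetical helper: global batch size and tokens per optimizer step,
    assuming pure data parallelism (each GPU holds a full model replica)."""
    world_size = gpus_per_node * num_nodes
    global_bs = per_device_bs * grad_accum * world_size
    return global_bs, global_bs * block_size

# Debug config from this script: bs=1, accum=1, 8 GPUs per node, 1 node (assumed)
print(effective_batch(1, 1, 8, 1))  # -> (8, 16384): 8 sequences, 16384 tokens per step
```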
3 changes: 2 additions & 1 deletion smoe/data/llama_moefication_datasets.py
@@ -21,7 +21,8 @@ def __init__(
"""numthreads should be set <=1, otherwise it will slow down the reading process by ~4 times"""
if num_threads > 1:
warnings.warn(
"num_threads should be set <=1, otherwise it will slow down the reading process by ~4 times!"
"num_threads should be set <=1, otherwise it will slow down the reading"
" process by ~4 times!"
)

if os.path.isfile(file_path) is False:
59 changes: 37 additions & 22 deletions smoe/entrypoint/cpt_fpt.py
Expand Up @@ -31,6 +31,7 @@
parse_args,
)
from smoe.utils.logging import get_logger_from_training_args
from smoe.utils.param import get_trainable_parameters

MODEL_MAP = {
"llama": LlamaForCausalLM,
@@ -72,15 +73,16 @@ def main():
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
f"Output directory ({training_args.output_dir}) already exists and is"
" not empty. Use --overwrite_output_dir to overcome."
)
elif (
last_checkpoint is not None and training_args.resume_from_checkpoint is None
):
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid"
" this behavior, change the `--output_dir` or add"
" `--overwrite_output_dir` to train from scratch."
)

# Set seed before initializing model.
@@ -128,25 +130,28 @@ def main():
)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
"You are instantiating a new tokenizer from scratch. This is not supported"
" by this script.You can do it from another script, save it, and load it"
" from here, using --tokenizer_name."
)

# Preprocessing the datasets.
if data_args.block_size is None:
block_size = tokenizer.model_max_length
if block_size > 1024:
logger.warning(
"The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
" of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
"The chosen tokenizer supports a `model_max_length` that is longer than"
" the default `block_size` value of 1024. If you would like to use a"
" longer `block_size` up to `tokenizer.model_max_length` you can"
" override this default with `--block_size xxx`."
)
block_size = 1024
else:
if data_args.block_size > tokenizer.model_max_length:
logger.warning(
f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
f"The block_size passed ({data_args.block_size}) is larger than the"
f" maximum length for the model({tokenizer.model_max_length}). Using"
f" block_size={tokenizer.model_max_length}."
)
block_size = min(data_args.block_size, tokenizer.model_max_length)

@@ -200,11 +205,14 @@ def main():
torch_dtype=torch_dtype,
low_cpu_mem_usage=True,
)
for name, param in model.named_parameters():
if "weight_noise.weight" in name:
nn.init.zeros_(param)
model.change_moe_gate_add_noise(False)
model.change_moe_gate_use_balance(False)
# train an MoE model from scratch 👇
# model: LlamaMoEForCausalLM = LlamaMoEForCausalLM(config)
if isinstance(model, LlamaMoEForCausalLM):
for name, param in model.named_parameters():
if "weight_noise.weight" in name:
nn.init.zeros_(param)
model.change_moe_gate_add_noise(False)
model.change_moe_gate_use_balance(False)
replace_xformers(model)
else:
model = AutoModelForCausalLM.from_config(config)
@@ -217,9 +225,12 @@ def main():
if model_vocab_size != len(tokenizer):
model.resize_token_embeddings(len(tokenizer))
raise ValueError(
f"The model's vocab size ({model_vocab_size}) does not match with the tokenizer ({len(tokenizer)})"
f"The model's vocab size ({model_vocab_size}) does not match with the"
f" tokenizer ({len(tokenizer)})"
)

get_trainable_parameters(model, verbose=True)

# Initialize our Trainer
trainer = LlamaLrSchedulingTrainer(
model=model,
@@ -228,12 +239,16 @@ def main():
eval_dataset=eval_dataset if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=fault_tolerance_data_collator,
compute_metrics=compute_metrics
if training_args.do_eval and not is_torch_tpu_available()
else None,
preprocess_logits_for_metrics=logits_argmax
if training_args.do_eval and not is_torch_tpu_available()
else None,
compute_metrics=(
compute_metrics
if training_args.do_eval and not is_torch_tpu_available()
else None
),
preprocess_logits_for_metrics=(
logits_argmax
if training_args.do_eval and not is_torch_tpu_available()
else None
),
)
trainer.add_callback(SaveModelCallback)
# Training
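Both entrypoints now call get_trainable_parameters(model, verbose=True) from smoe.utils.param in place of PEFT's model.print_trainable_parameters(), but the helper itself is not included in this commit. A minimal sketch of what such a utility typically looks like (only the name and signature come from the call sites above; the body below is an assumption):

```python
import torch.nn as nn


def get_trainable_parameters(model: nn.Module, verbose: bool = True):
    """Hypothetical sketch: count trainable vs. total parameters of a model."""
    trainable, total = 0, 0
    for _, param in model.named_parameters():
        num = param.numel()
        total += num
        if param.requires_grad:
            trainable += num
    if verbose:
        print(
            f"trainable params: {trainable:,} || all params: {total:,}"
            f" || trainable%: {100 * trainable / max(total, 1):.4f}"
        )
    return trainable, total
```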
46 changes: 28 additions & 18 deletions smoe/entrypoint/cpt_lora.py
Expand Up @@ -32,6 +32,7 @@
parse_args,
)
from smoe.utils.logging import get_logger_from_training_args
from smoe.utils.param import get_trainable_parameters

MODEL_MAP = {
"llama": LlamaForCausalLM,
@@ -73,15 +74,16 @@ def main():
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
f"Output directory ({training_args.output_dir}) already exists and is"
" not empty. Use --overwrite_output_dir to overcome."
)
elif (
last_checkpoint is not None and training_args.resume_from_checkpoint is None
):
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid"
" this behavior, change the `--output_dir` or add"
" `--overwrite_output_dir` to train from scratch."
)

# Set seed before initializing model.
@@ -129,25 +131,28 @@ def main():
)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
"You are instantiating a new tokenizer from scratch. This is not supported"
" by this script.You can do it from another script, save it, and load it"
" from here, using --tokenizer_name."
)

# Preprocessing the datasets.
if data_args.block_size is None:
block_size = tokenizer.model_max_length
if block_size > 1024:
logger.warning(
"The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
" of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
"The chosen tokenizer supports a `model_max_length` that is longer than"
" the default `block_size` value of 1024. If you would like to use a"
" longer `block_size` up to `tokenizer.model_max_length` you can"
" override this default with `--block_size xxx`."
)
block_size = 1024
else:
if data_args.block_size > tokenizer.model_max_length:
logger.warning(
f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
f"The block_size passed ({data_args.block_size}) is larger than the"
f" maximum length for the model({tokenizer.model_max_length}). Using"
f" block_size={tokenizer.model_max_length}."
)
block_size = min(data_args.block_size, tokenizer.model_max_length)

@@ -221,7 +226,8 @@ def main():
if model_vocab_size != len(tokenizer):
model.resize_token_embeddings(len(tokenizer))
raise ValueError(
f"The model's vocab size ({model_vocab_size}) does not match with the tokenizer ({len(tokenizer)})"
f"The model's vocab size ({model_vocab_size}) does not match with the"
f" tokenizer ({len(tokenizer)})"
)
if training_args.peft_path is not None:
logger.info("Peft from pre-trained model")
@@ -258,7 +264,7 @@ def make_inputs_require_grad(module, input, output):

model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
get_trainable_parameters(model, verbose=True)

# Initialize our Trainer
trainer = LlamaLrSchedulingTrainer(
@@ -268,12 +274,16 @@ def make_inputs_require_grad(module, input, output):
eval_dataset=eval_dataset if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=fault_tolerance_data_collator,
compute_metrics=compute_metrics
if training_args.do_eval and not is_torch_tpu_available()
else None,
preprocess_logits_for_metrics=logits_argmax
if training_args.do_eval and not is_torch_tpu_available()
else None,
compute_metrics=(
compute_metrics
if training_args.do_eval and not is_torch_tpu_available()
else None
),
preprocess_logits_for_metrics=(
logits_argmax
if training_args.do_eval and not is_torch_tpu_available()
else None
),
)
trainer.add_callback(SaveModelCallback)
# Training
5 changes: 4 additions & 1 deletion smoe/entrypoint/moefication/llama_split_clustering.py
@@ -19,7 +19,10 @@
"--templates",
type=str,
default="layers.{}.mlp.gate_proj.weight",
help="weight names of the first linear layer in each FFN (use comma to separate multiple templates)",
help=(
"weight names of the first linear layer in each FFN (use comma to separate"
" multiple templates)"
),
)
parser.add_argument("--num_experts", type=int, default=8, help="number of experts")

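The --templates help text above refers to comma-separated weight-name templates such as layers.{}.mlp.gate_proj.weight. A small illustration of how such templates are usually expanded per layer before clustering (illustrative only; the actual splitting logic is not part of this diff, and the layer count is assumed):

```python
templates = "layers.{}.mlp.gate_proj.weight"  # may hold several comma-separated patterns
num_layers = 32  # e.g. a LLaMA-7B checkpoint (assumed)

for template in templates.split(","):
    for layer_idx in range(num_layers):
        weight_name = template.format(layer_idx)  # "layers.0.mlp.gate_proj.weight", ...
        # look up `weight_name` in the checkpoint and cluster its rows into num_experts groups
```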
14 changes: 9 additions & 5 deletions smoe/models/llama_moefication/modeling_llama_moe.py
@@ -36,9 +36,11 @@ def __init__(self, config: LlamaMoEConfig, layer_index):
hidden_act=config.hidden_act,
num_experts=config.num_experts,
num_selects=config.num_selects,
size_experts=config.size_experts[layer_index]
if config.size_experts is not None
else None,
size_experts=(
config.size_experts[layer_index]
if config.size_experts is not None
else None
),
bias=False,
gate_network=config.gates,
gate_use_balance=True,
@@ -145,7 +147,8 @@ def forward(
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at"
" the same time"
)
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
@@ -197,7 +200,8 @@ def forward(
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
"`use_cache=True` is incompatible with gradient checkpointing."
" Setting `use_cache=False`..."
)
use_cache = False

19 changes: 11 additions & 8 deletions smoe/modules/moefication/moe_experts.py
@@ -179,12 +179,15 @@ def forward(self, input, i):
return down

def extra_repr(self):
return "in_features={}, hidden_features={}, out_features={}, hidden_act={}, num_experts={}, size_experts={}, bias={}".format(
self.in_features,
self.hidden_features,
self.out_features,
self.hidden_act,
self.num_experts,
self.size_experts,
self.bias_gate is not None,
return (
"in_features={}, hidden_features={}, out_features={}, hidden_act={},"
" num_experts={}, size_experts={}, bias={}".format(
self.in_features,
self.hidden_features,
self.out_features,
self.hidden_act,
self.num_experts,
self.size_experts,
self.bias_gate is not None,
)
)