Commit

update fpt scripts
Spico197 committed Jul 27, 2023
1 parent fd65831 commit deacfcb
Showing 11 changed files with 631 additions and 84 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -4,7 +4,7 @@

## 🌴 Dependencies

- Python >= 3.10
- Python >= 3.11
- scikit-learn>=1.3.0
- omegaconf>=2.0.6
- tqdm>=4.65.0
@@ -33,6 +33,12 @@

```bash
$ git clone git@github.com:pjlab-sys4nlp/train-moe.git
$ cd train-moe
$ pip install -e .[dev]
$ pre-commit install
```

## 🔗 Experiments

- CPT
  - [MoEfication L2-norm 8-choose-4 continued pre-training experiment](https://m04hsypyylv.feishu.cn/docx/R9Tid61U0oOuQ4xwrbGcyCyvnMf)
7 changes: 4 additions & 3 deletions scripts/cpt/fpt.sh
@@ -1,6 +1,6 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-moe-fpt-bs8
#SBATCH --job-name=cpt-moe-fpt-bs1-debug
#SBATCH --partition=MoE
#SBATCH --output=logs/%x-%j.log
#SBATCH --error=logs/%x-%j.log
@@ -18,6 +18,8 @@ num_gpu_per_node=8 # should match with --gres

# #cpu/#num_gpu_per_node
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO
export LOGLEVEL=INFO

lr=1e-4

@@ -29,7 +31,7 @@ pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B_MoE_16Select4
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B
dataset_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed

per_device_train_batch_size=8
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=1
block_size=2048
@@ -53,7 +55,6 @@ head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Node: $head_node"
echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO

srun torchrun \
--nnodes ${num_nodes} \
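For context on the batch-size change above (per_device_train_batch_size drops from 8 to 1 for the debug run), the effective global batch size in a plain data-parallel torchrun launch is the product of the per-device batch size, the gradient accumulation steps, and the total number of GPUs. A minimal sketch of that arithmetic (the helper name and the single-node assumption are illustrative, not part of this commit):

```python
def effective_batch(per_device_bs: int, grad_accum: int, gpus_per_node: int,
                    num_nodes: int, block_size: int = 2048):
    """Hypothetical helper: global batch size and tokens per optimizer step,
    assuming pure data parallelism (each GPU holds a full model replica)."""
    world_size = gpus_per_node * num_nodes
    global_bs = per_device_bs * grad_accum * world_size
    return global_bs, global_bs * block_size

# Debug config from this script: bs=1, accum=1, 8 GPUs per node, 1 node (assumed)
print(effective_batch(1, 1, 8, 1))  # -> (8, 16384): 8 sequences, 16384 tokens per step
```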
3 changes: 2 additions & 1 deletion smoe/data/llama_moefication_datasets.py
@@ -21,7 +21,8 @@ def __init__(
"""numthreads should be set <=1, otherwise it will slow down the reading process by ~4 times"""
if num_threads > 1:
warnings.warn(
"num_threads should be set <=1, otherwise it will slow down the reading process by ~4 times!"
"num_threads should be set <=1, otherwise it will slow down the reading"
" process by ~4 times!"
)

if os.path.isfile(file_path) is False:
59 changes: 37 additions & 22 deletions smoe/entrypoint/cpt_fpt.py
Expand Up @@ -31,6 +31,7 @@
parse_args,
)
from smoe.utils.logging import get_logger_from_training_args
from smoe.utils.param import get_trainable_parameters

MODEL_MAP = {
"llama": LlamaForCausalLM,
@@ -72,15 +73,16 @@ def main():
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
f"Output directory ({training_args.output_dir}) already exists and is"
" not empty. Use --overwrite_output_dir to overcome."
)
elif (
last_checkpoint is not None and training_args.resume_from_checkpoint is None
):
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid"
" this behavior, change the `--output_dir` or add"
" `--overwrite_output_dir` to train from scratch."
)

# Set seed before initializing model.
@@ -128,25 +130,28 @@ def main():
)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
"You are instantiating a new tokenizer from scratch. This is not supported"
" by this script.You can do it from another script, save it, and load it"
" from here, using --tokenizer_name."
)

# Preprocessing the datasets.
if data_args.block_size is None:
block_size = tokenizer.model_max_length
if block_size > 1024:
logger.warning(
"The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
" of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
"The chosen tokenizer supports a `model_max_length` that is longer than"
" the default `block_size` value of 1024. If you would like to use a"
" longer `block_size` up to `tokenizer.model_max_length` you can"
" override this default with `--block_size xxx`."
)
block_size = 1024
else:
if data_args.block_size > tokenizer.model_max_length:
logger.warning(
f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
f"The block_size passed ({data_args.block_size}) is larger than the"
f" maximum length for the model({tokenizer.model_max_length}). Using"
f" block_size={tokenizer.model_max_length}."
)
block_size = min(data_args.block_size, tokenizer.model_max_length)

@@ -200,11 +205,14 @@ def main():
torch_dtype=torch_dtype,
low_cpu_mem_usage=True,
)
for name, param in model.named_parameters():
if "weight_noise.weight" in name:
nn.init.zeros_(param)
model.change_moe_gate_add_noise(False)
model.change_moe_gate_use_balance(False)
# train an MoE model from scratch 👇
# model: LlamaMoEForCausalLM = LlamaMoEForCausalLM(config)
if isinstance(model, LlamaMoEForCausalLM):
for name, param in model.named_parameters():
if "weight_noise.weight" in name:
nn.init.zeros_(param)
model.change_moe_gate_add_noise(False)
model.change_moe_gate_use_balance(False)
replace_xformers(model)
else:
model = AutoModelForCausalLM.from_config(config)
@@ -217,9 +225,12 @@ def main():
if model_vocab_size != len(tokenizer):
model.resize_token_embeddings(len(tokenizer))
raise ValueError(
f"The model's vocab size ({model_vocab_size}) does not match with the tokenizer ({len(tokenizer)})"
f"The model's vocab size ({model_vocab_size}) does not match with the"
f" tokenizer ({len(tokenizer)})"
)

get_trainable_parameters(model, verbose=True)

# Initialize our Trainer
trainer = LlamaLrSchedulingTrainer(
model=model,
@@ -228,12 +239,16 @@ def main():
eval_dataset=eval_dataset if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=fault_tolerance_data_collator,
compute_metrics=compute_metrics
if training_args.do_eval and not is_torch_tpu_available()
else None,
preprocess_logits_for_metrics=logits_argmax
if training_args.do_eval and not is_torch_tpu_available()
else None,
compute_metrics=(
compute_metrics
if training_args.do_eval and not is_torch_tpu_available()
else None
),
preprocess_logits_for_metrics=(
logits_argmax
if training_args.do_eval and not is_torch_tpu_available()
else None
),
)
trainer.add_callback(SaveModelCallback)
# Training
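Both entrypoints now call get_trainable_parameters(model, verbose=True) from smoe.utils.param in place of PEFT's model.print_trainable_parameters(), but the helper itself is not included in this commit. A minimal sketch of what such a utility typically looks like (only the name and signature come from the call sites above; the body below is an assumption):

```python
import torch.nn as nn


def get_trainable_parameters(model: nn.Module, verbose: bool = True):
    """Hypothetical sketch: count trainable vs. total parameters of a model."""
    trainable, total = 0, 0
    for _, param in model.named_parameters():
        num = param.numel()
        total += num
        if param.requires_grad:
            trainable += num
    if verbose:
        print(
            f"trainable params: {trainable:,} || all params: {total:,}"
            f" || trainable%: {100 * trainable / max(total, 1):.4f}"
        )
    return trainable, total
```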
46 changes: 28 additions & 18 deletions smoe/entrypoint/cpt_lora.py
Expand Up @@ -32,6 +32,7 @@
parse_args,
)
from smoe.utils.logging import get_logger_from_training_args
from smoe.utils.param import get_trainable_parameters

MODEL_MAP = {
"llama": LlamaForCausalLM,
@@ -73,15 +74,16 @@ def main():
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
f"Output directory ({training_args.output_dir}) already exists and is"
" not empty. Use --overwrite_output_dir to overcome."
)
elif (
last_checkpoint is not None and training_args.resume_from_checkpoint is None
):
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid"
" this behavior, change the `--output_dir` or add"
" `--overwrite_output_dir` to train from scratch."
)

# Set seed before initializing model.
@@ -129,25 +131,28 @@ def main():
)
else:
raise ValueError(
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
"You are instantiating a new tokenizer from scratch. This is not supported"
" by this script.You can do it from another script, save it, and load it"
" from here, using --tokenizer_name."
)

# Preprocessing the datasets.
if data_args.block_size is None:
block_size = tokenizer.model_max_length
if block_size > 1024:
logger.warning(
"The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value"
" of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can"
"The chosen tokenizer supports a `model_max_length` that is longer than"
" the default `block_size` value of 1024. If you would like to use a"
" longer `block_size` up to `tokenizer.model_max_length` you can"
" override this default with `--block_size xxx`."
)
block_size = 1024
else:
if data_args.block_size > tokenizer.model_max_length:
logger.warning(
f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
f"The block_size passed ({data_args.block_size}) is larger than the"
f" maximum length for the model({tokenizer.model_max_length}). Using"
f" block_size={tokenizer.model_max_length}."
)
block_size = min(data_args.block_size, tokenizer.model_max_length)

@@ -221,7 +226,8 @@ def main():
if model_vocab_size != len(tokenizer):
model.resize_token_embeddings(len(tokenizer))
raise ValueError(
f"The model's vocab size ({model_vocab_size}) does not match with the tokenizer ({len(tokenizer)})"
f"The model's vocab size ({model_vocab_size}) does not match with the"
f" tokenizer ({len(tokenizer)})"
)
if training_args.peft_path is not None:
logger.info("Peft from pre-trained model")
@@ -258,7 +264,7 @@ def make_inputs_require_grad(module, input, output):

model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
get_trainable_parameters(model, verbose=True)

# Initialize our Trainer
trainer = LlamaLrSchedulingTrainer(
@@ -268,12 +274,16 @@ def make_inputs_require_grad(module, input, output):
eval_dataset=eval_dataset if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=fault_tolerance_data_collator,
compute_metrics=compute_metrics
if training_args.do_eval and not is_torch_tpu_available()
else None,
preprocess_logits_for_metrics=logits_argmax
if training_args.do_eval and not is_torch_tpu_available()
else None,
compute_metrics=(
compute_metrics
if training_args.do_eval and not is_torch_tpu_available()
else None
),
preprocess_logits_for_metrics=(
logits_argmax
if training_args.do_eval and not is_torch_tpu_available()
else None
),
)
trainer.add_callback(SaveModelCallback)
# Training
5 changes: 4 additions & 1 deletion smoe/entrypoint/moefication/llama_split_clustering.py
@@ -19,7 +19,10 @@
"--templates",
type=str,
default="layers.{}.mlp.gate_proj.weight",
help="weight names of the first linear layer in each FFN (use comma to separate multiple templates)",
help=(
"weight names of the first linear layer in each FFN (use comma to separate"
" multiple templates)"
),
)
parser.add_argument("--num_experts", type=int, default=8, help="number of experts")

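The --templates help text above refers to comma-separated weight-name templates such as layers.{}.mlp.gate_proj.weight. A small illustration of how such templates are usually expanded per layer before clustering (illustrative only; the actual splitting logic is not part of this diff, and the layer count is assumed):

```python
templates = "layers.{}.mlp.gate_proj.weight"  # may hold several comma-separated patterns
num_layers = 32  # e.g. a LLaMA-7B checkpoint (assumed)

for template in templates.split(","):
    for layer_idx in range(num_layers):
        weight_name = template.format(layer_idx)  # "layers.0.mlp.gate_proj.weight", ...
        # look up `weight_name` in the checkpoint and cluster its rows into num_experts groups
```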
14 changes: 9 additions & 5 deletions smoe/models/llama_moefication/modeling_llama_moe.py
@@ -36,9 +36,11 @@ def __init__(self, config: LlamaMoEConfig, layer_index):
hidden_act=config.hidden_act,
num_experts=config.num_experts,
num_selects=config.num_selects,
size_experts=config.size_experts[layer_index]
if config.size_experts is not None
else None,
size_experts=(
config.size_experts[layer_index]
if config.size_experts is not None
else None
),
bias=False,
gate_network=config.gates,
gate_use_balance=True,
@@ -145,7 +147,8 @@ def forward(
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at"
" the same time"
)
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
@@ -197,7 +200,8 @@ def forward(
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
"`use_cache=True` is incompatible with gradient checkpointing."
" Setting `use_cache=False`..."
)
use_cache = False

19 changes: 11 additions & 8 deletions smoe/modules/moefication/moe_experts.py
@@ -179,12 +179,15 @@ def forward(self, input, i):
return down

def extra_repr(self):
return "in_features={}, hidden_features={}, out_features={}, hidden_act={}, num_experts={}, size_experts={}, bias={}".format(
self.in_features,
self.hidden_features,
self.out_features,
self.hidden_act,
self.num_experts,
self.size_experts,
self.bias_gate is not None,
return (
"in_features={}, hidden_features={}, out_features={}, hidden_act={},"
" num_experts={}, size_experts={}, bias={}".format(
self.in_features,
self.hidden_features,
self.out_features,
self.hidden_act,
self.num_experts,
self.size_experts,
self.bias_gate is not None,
)
)