
Commit

init
Spico197 committed Jul 24, 2023
1 parent 9189aae commit b38db8c
Showing 27 changed files with 1,171 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
@@ -158,3 +158,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

debug.py
18 changes: 18 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,18 @@
repos:
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        name: isort (python)
        args: ["--profile", "black", "--filter-files"]
  - repo: https://github.com/psf/black
    rev: 22.12.0
    hooks:
      - id: black
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
      - id: check-added-large-files
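These hooks would presumably be enabled locally with `pre-commit install` after installing the `dev` extras; the `pre` target in the Makefile below runs the same checks over the whole tree via `pre-commit run --all-files`.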
30 changes: 30 additions & 0 deletions Makefile
@@ -0,0 +1,30 @@
all: format clean pre test
	echo 'finished'

.PHONY: format
format:
	isort --profile black --filter-files .
	black .

.PHONY: test
test:
	coverage run --source smoe -m pytest -vv .
	coverage report -m
	flake8

.PHONY: pre
pre:
	pre-commit run --all-files

.PHONY: debug
debug:
	pytest -vv tests/tasks/test_re.py

.PHONY: clean
clean:
	rm -rf build/
	rm -rf dist/
	rm -rf *.egg-info/
	rm -f .coverage
	rm -f coverage.xml
	find . | grep -E '(__pycache__|\.pyc|\.pyo$$)' | xargs rm -rf
12 changes: 11 additions & 1 deletion README.md
@@ -1 +1,11 @@
# smoe

## For developers

- Make sure the Python version is `>=3.10` (a strict version constraint for better type hinting)

```bash
$ pip install -e .[dev]
```
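As an aside on the `>=3.10` requirement, the style of typing it enables looks roughly like the sketch below; the function and names are purely illustrative and not part of this commit.

```python
def pick_expert(scores: dict[str, float], threshold: float | None = None) -> str | None:
    """Return the highest-scoring expert name, or None if nothing clears the threshold."""
    best = max(scores, key=scores.get, default=None)  # built-in generics, PEP 604 unions
    if best is None or (threshold is not None and scores[best] < threshold):
        return None
    return best
```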


1 change: 1 addition & 0 deletions VERSION
@@ -0,0 +1 @@
0.0.0
20 changes: 20 additions & 0 deletions conf/deepspeed/bf16.json
@@ -0,0 +1,20 @@
{
    "bf16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 1e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 1e8,
        "contiguous_gradients": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
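For context, the `"auto"` values are placeholders that the HuggingFace Trainer's DeepSpeed integration resolves from the training arguments when the file is passed via `--deepspeed` (as in the `scripts/cpt` scripts below). A minimal sketch of that wiring, with illustrative values only:

```python
from transformers import TrainingArguments

# The HF DeepSpeed integration fills the "auto" fields (micro batch size,
# gradient accumulation, gradient clipping, total batch size) from these args.
args = TrainingArguments(
    output_dir="output_dir",              # illustrative path
    deepspeed="conf/deepspeed/bf16.json",
    bf16=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
)
```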
26 changes: 26 additions & 0 deletions conf/deepspeed/fp16.json
@@ -0,0 +1,26 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 100,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1e-10
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 1e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 1e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
scikit-learn>=1.3.0
omegaconf>=2.0.6
tqdm>=4.65.0
datasets>=2.13.1
transformers>=4.30.2
peft>=0.4.0
4 changes: 4 additions & 0 deletions scripts/cpt/README.md
@@ -0,0 +1,4 @@
# Scripts for Continual Pre-training

- `lora.sh`: Parameter-efficient tuning
- `fpt.sh`: Full-parameter pretraining
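Both scripts carry `#SBATCH` headers, so on a Slurm cluster they would presumably be submitted with `sbatch scripts/cpt/fpt.sh` or `sbatch scripts/cpt/lora.sh` rather than executed directly.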
74 changes: 74 additions & 0 deletions scripts/cpt/fpt.sh
@@ -0,0 +1,74 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-bf16-2nodes-woLora
#SBATCH --partition=MoE
#SBATCH --output=logs/%x.log
#SBATCH --error=logs/%x.log

#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=8

source ~/anaconda3/bin/activate torch

lr=2e-4

pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B/
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B/
dataset_dir=resources
data_cache=temp_data_cache_dir
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=8
output_dir=output_dir_cpt_ymcui

deepspeed_config_file=conf/ds_bf16.json

nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
nodes_array=($nodes)
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Node: $head_node"
echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO

srun torchrun \
--nnodes 2 \
--nproc_per_node 8 \
--node_rank $SLURM_NODEID \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
src/entrypoint/run_clm_pt_wo_peft.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--tokenizer_name_or_path ${tokenizer_path} \
--dataset_dir ${dataset_dir} \
--data_cache_dir ${data_cache} \
--validation_split_percentage 0.001 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--seed $RANDOM \
--bf16 \
--num_train_epochs 1 \
--lr_scheduler_type cosine \
--learning_rate ${lr} \
--warmup_ratio 0.05 \
--weight_decay 0.01 \
--logging_strategy steps \
--logging_steps 10 \
--save_strategy steps \
--save_total_limit 3 \
--save_steps 200 \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--preprocessing_num_workers 8 \
--block_size 512 \
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 30000 \
--logging_first_step True \
--torch_dtype bfloat16 \
--gradient_checkpointing \
--ddp_find_unused_parameters False
84 changes: 84 additions & 0 deletions scripts/cpt/lora.sh
@@ -0,0 +1,84 @@
#!/usr/bin/bash

#SBATCH --job-name=cpt-lora-bf16-2nodes
#SBATCH --partition=MoE
#SBATCH --output=logs/%x.log
#SBATCH --error=logs/%x.log

#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:8
#SBATCH --cpus-per-task=8

source ~/anaconda3/bin/activate torch

lr=2e-4
lora_rank=8
lora_alpha=32
lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
modules_to_save="embed_tokens,lm_head"
lora_dropout=0.05

pretrained_model=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B/
tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B/
dataset_dir=resources
data_cache=temp_data_cache_dir
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=8
output_dir=output_dir

deepspeed_config_file=conf/ds_bf16.json

nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) )
nodes_array=($nodes)
head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
echo "Node: $head_node"
echo "Node IP: $head_node_ip"
export LOGLEVEL=INFO

srun torchrun \
--nnodes 2 \
--nproc_per_node 8 \
--node_rank $SLURM_NODEID \
--rdzv_id $RANDOM \
--rdzv_backend c10d \
--rdzv_endpoint $head_node:29518 \
src/entrypoint/run_clm_pt_with_peft.py \
--deepspeed ${deepspeed_config_file} \
--model_name_or_path ${pretrained_model} \
--tokenizer_name_or_path ${tokenizer_path} \
--dataset_dir ${dataset_dir} \
--data_cache_dir ${data_cache} \
--validation_split_percentage 0.001 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--do_train \
--seed $RANDOM \
--bf16 \
--num_train_epochs 1 \
--lr_scheduler_type cosine \
--learning_rate ${lr} \
--warmup_ratio 0.05 \
--weight_decay 0.01 \
--logging_strategy steps \
--logging_steps 10 \
--save_strategy steps \
--save_total_limit 3 \
--save_steps 200 \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--preprocessing_num_workers 8 \
--block_size 512 \
--output_dir ${output_dir} \
--overwrite_output_dir \
--ddp_timeout 30000 \
--logging_first_step True \
--lora_rank ${lora_rank} \
--lora_alpha ${lora_alpha} \
--trainable ${lora_trainable} \
--modules_to_save ${modules_to_save} \
--lora_dropout ${lora_dropout} \
--torch_dtype float16 \
--gradient_checkpointing \
--ddp_find_unused_parameters False
49 changes: 49 additions & 0 deletions setup.py
@@ -0,0 +1,49 @@
import os

import setuptools

readme_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md")
with open(readme_filepath, "r") as fh:
    long_description = fh.read()

version_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "VERSION")
with open(version_filepath, "r") as fh:
    version = fh.read().strip()

setuptools.setup(
    name="smoe",
    version=version,
    author="MoE Group",
    author_email="[email protected]",
    description="A toolkit for LLM MoE and continual pretraining.",
    long_description_content_type="text/markdown",
    long_description=long_description,
    url="https://github.com/Spico197/smoe",
    packages=setuptools.find_packages(exclude=["tests", "tests.*", "docs", "docs.*"]),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.10",
    install_requires=[
        "scikit-learn>=1.3.0",
        "omegaconf>=2.0.6",
        "tqdm>=4.65.0",
        "datasets>=2.13.1",
        "transformers>=4.30.2",
        "peft>=0.4.0",
    ],
    extras_require={
        "dev": [
            "pytest",
            "coverage",
            "black",
            "isort",
            "flake8",
            "pre-commit",
        ]
    },
    include_package_data=True,
    entry_points={},
)
Empty file added smoe/__init__.py
Empty file.
Empty file added smoe/callbacks/__init__.py
Empty file.
32 changes: 32 additions & 0 deletions smoe/callbacks/save_peft_model.py
@@ -0,0 +1,32 @@
import os

from transformers import TrainerCallback
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR


class SavePeftModelCallback(TrainerCallback):
    """Save the PEFT adapter and tokenizer alongside Trainer checkpoints."""

    def __init__(self, peft_model_subdir: str = "peft_model"):
        self.peft_model_subdir = peft_model_subdir

    def save_model(self, args, state, **kwargs):
        # Save next to the best checkpoint if one is tracked, otherwise into
        # the checkpoint folder for the current global step.
        if state.best_model_checkpoint is not None:
            checkpoint_folder = os.path.join(
                state.best_model_checkpoint, self.peft_model_subdir
            )
        else:
            checkpoint_folder = os.path.join(
                args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}"
            )

        peft_model_path = os.path.join(checkpoint_folder, self.peft_model_subdir)
        kwargs["model"].save_pretrained(peft_model_path)
        kwargs["tokenizer"].save_pretrained(peft_model_path)

    def on_save(self, args, state, control, **kwargs):
        self.save_model(args, state, **kwargs)
        return control

    def on_train_end(self, args, state, control, **kwargs):
        # Write a final copy of the adapter and tokenizer to the output directory.
        peft_model_path = os.path.join(args.output_dir, self.peft_model_subdir)
        kwargs["model"].save_pretrained(peft_model_path)
        kwargs["tokenizer"].save_pretrained(peft_model_path)
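A rough sketch of how this callback might be hooked into a `transformers.Trainer`; `peft_model`, `tokenizer`, and `train_dataset` are placeholders assumed to be built elsewhere (e.g. by the LoRA script above), not part of this commit.

```python
from transformers import Trainer, TrainingArguments

from smoe.callbacks.save_peft_model import SavePeftModelCallback

# `peft_model`, `tokenizer`, and `train_dataset` are assumed to exist already
# (e.g. a LoRA-wrapped LLaMA model); shown only to illustrate the callback hookup.
trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(output_dir="output_dir", save_steps=200),
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    callbacks=[SavePeftModelCallback(peft_model_subdir="peft_model")],
)
trainer.train()  # adapter weights and tokenizer are saved next to each checkpoint
```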
Empty file added smoe/data/__init__.py
Empty file.
