Commit message:

* initial llmfoundry finetune functionality
* update docstring with samples
* fix quality
* refactor to use TaskRunner; add ddp support
* add trainhook for single gpu/cpu run
* add enum for llm datatypes, use task info for finetune pathways, update docstring
* add click for arguments, add finetune args, update entrypoints
* add try/except around imports
* quality
* PR comments
* LLM finetune sparsify masking (#278)
* add functions to mask weights during finetuning
* update logic for loading weights
* update yaml
* update mask name
* add logic to update batch size based on GPU count
* make sparsify requirements less broad; move sparseml[transformers] to nm deps
* remove flash-attn
* quality
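The key technique in this change set is the sparsify masking added in #278: one-shot pruning has already zeroed a subset of weights, and finetuning must keep those positions at zero while the optimizer updates the rest. The commit's own masking code is not shown in this view, so the following is only a minimal PyTorch sketch of the idea, with hypothetical helper names:

import torch


def attach_sparsity_masks(model: torch.nn.Module) -> dict:
    """Snapshot a 0/1 mask per weight tensor and zero the gradients of
    pruned positions so the gradient term cannot revive them."""
    masks = {}
    for name, param in model.named_parameters():
        if param.dim() < 2:  # skip biases and norm scales
            continue
        mask = (param != 0).to(param.dtype)
        masks[name] = mask
        # Bind the mask via a default argument to avoid late binding.
        param.register_hook(lambda grad, m=mask: grad * m)
    return masks


@torch.no_grad()
def reapply_masks(model: torch.nn.Module, masks: dict) -> None:
    """Call after optimizer.step(): stateful optimizers (e.g. AdamW
    momentum) can still move pruned weights off zero, so force them back."""
    for name, param in model.named_parameters():
        if name in masks:
            param.mul_(masks[name])

Both halves matter: the gradient hook stops the loss term from updating pruned weights, and the post-step pass undoes drift from stateful optimizer updates. The YAML below disables gradient clipping for a related reason: with no way to mask pruned-weight gradients, the global gradient norm would be wrong.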
Showing 14 changed files with 766 additions and 16 deletions.
MANIFEST.in (1 addition, 0 deletions):
@@ -1,3 +1,4 @@
 recursive-include src/sparsify/ui/ *
 include LICENSE
 include src/sparsify/auto/tasks/deployment_instructions.md
+include src/sparsify/auto/samples/finetune_llmfoundry_sample.yaml
src/sparsify/auto/samples/finetune_llmfoundry_sample.yaml (134 additions, 0 deletions):

@@ -0,0 +1,134 @@
max_seq_len: 2048
global_seed: 17
model_name_or_path: mosaicml/mpt-7b-instruct
load_path: /storage/dsikka/mpt_7b_instruct_oneshot_sp70.pt
precision: amp_bf16

max_duration: 1ep
eval_interval: 1ep
# eval_subset_num_batches: 3 # use this for quick testing
eval_first: true
seed: ${global_seed}

global_train_batch_size: 1
# for mpt-7b dense:
# 4 x A100_80GB = "device_train_microbatch_size: 12"
# 8 x A6000_48GB = "device_train_microbatch_size: 6"

# for mpt-7b sparse (with masks):
# 8 x A6000_48GB = "device_train_microbatch_size: 4"
device_train_batch_size: 1
device_train_microbatch_size: 1
device_eval_batch_size: 1

# Run Name
run_name: test_run

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mosaicml/mpt-7b-instruct
  max_seq_len: ${max_seq_len}
  config_overrides:
    attn_config:
      attn_impl: torch
      # Set this to `true` if using `train_loader.dataset.packing_ratio` below
      attn_uses_sequence_id: true

# Tokenizer
tokenizer:
  name: EleutherAI/gpt-neox-20b
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: mosaicml/dolly_hhrlhf
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
    # # to profile this run's optimal packing_ratio as it depends on GPU count,
    # # batch size, sequence length
    packing_ratio: 13 # padding=0.36%, waste=0.79%
    shuffle: true
  drop_last: false
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

eval_loader:
  name: finetuning
  dataset:
    hf_name: mosaicml/dolly_hhrlhf
    split: test
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    packing_ratio: 13
    shuffle: false
  drop_last: false
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

# Optimization
scheduler:
  name: linear_decay_with_warmup
  t_warmup: 20ba
  alpha_f: 0

optimizer:
  name: decoupled_adamw
  lr: 1e-4
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-8
  weight_decay: 0.0

# we can't use gradient clipping for sparse training runs because we don't have
# a way to mask gradients of pruned weights, and thus the global gradient norm
# will be incorrect
# algorithms:
#   gradient_clipping:
#     clipping_type: norm
#     clipping_threshold: 1.0

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: FULL
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

loggers:
  tensorboard: {}

# Checkpoint to local filesystem or remote object store
save_interval: 1ep
save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK
save_folder: output_dir/{run_name}/checkpoints
save_overwrite: true
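The commit message mentions logic to update batch size based on GPU count, and the comments above pin per-GPU microbatch sizes to specific hardware. A rough arithmetic sketch of how these knobs are assumed to relate (illustrative values; not composer's actual implementation):

# Assumed semantics of the batch-size knobs above; values chosen to match
# the sparse 8 x A6000 comment rather than this sample's debug settings.
num_gpus = 8
global_train_batch_size = 32
device_train_microbatch_size = 4  # largest per-GPU chunk that fits in memory

device_train_batch_size = global_train_batch_size // num_gpus               # 4
grad_accum_steps = device_train_batch_size // device_train_microbatch_size  # 1
print(device_train_batch_size, grad_accum_steps)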
A package __init__.py (26 additions, 0 deletions; the full path is not shown in this view):

@@ -0,0 +1,26 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# flake8: noqa

from .args import *


try:
    from .finetune import *
    from .runner import *
except ImportError as exception:
    raise ImportError(
        "To use the llm finetuning pathway, please install sparsify[llm]"
    ) from exception
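This guard (the "try/except around imports" bullet in the commit message) turns a missing optional dependency into an actionable error: importing the finetune pathway without the llm extras raises an ImportError that points at installing sparsify[llm], rather than a bare missing-module traceback from deep inside llmfoundry.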
New file (34 additions, 0 deletions; evidently the .args module imported by the __init__.py above, path not shown in this view):

@@ -0,0 +1,34 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pydantic import Field
from sparsify.auto.tasks import BaseArgs


__all__ = ["FineTuneTrainArgs"]


class FineTuneTrainArgs(BaseArgs):
    yaml: str = Field(
        default=None,
        description="path to the training yaml",
    )
    checkpoints: str = Field(
        default=None,
        description="path to the directory to store checkpoints",
    )
    logging: str = Field(
        default=None,
        description="path to store logs",
    )
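A hypothetical usage sketch of the new args class; the import path is a guess, since this view does not show where the module lives in the package tree:

from sparsify.auto.tasks.finetune import FineTuneTrainArgs  # path assumed

args = FineTuneTrainArgs(
    yaml="src/sparsify/auto/samples/finetune_llmfoundry_sample.yaml",
    checkpoints="output_dir/test_run/checkpoints",
    logging="output_dir/test_run/logs",
)
# BaseArgs appears to be a pydantic model (note the Field import), so the
# usual pydantic serialization API should apply:
print(args.dict())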