From 0fb03594a39b318e9908ce57202f4097c44af8cf Mon Sep 17 00:00:00 2001
From: "Wang, Yi A"
Date: Tue, 21 Nov 2023 17:28:40 -0800
Subject: [PATCH] add support for the b-mc2/sql-create-context dataset for codegen

Signed-off-by: Wang, Yi A
---
 examples/language-modeling/README.md       | 29 ++++++++++++++++++++
 examples/language-modeling/run_lora_clm.py | 31 +++++++++++++++++++---
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md
index dad97020c6..aa0f9bc3d3 100644
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -455,6 +455,35 @@ LOWER_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \
     --low_cpu_mem_usage True
 ```
 
+- Multi-card finetuning of codegen-16B-mono:
+```bash
+python ../gaudi_spawn.py \
+    --world_size 8 --use_mpi run_lora_clm.py \
+    --model_name_or_path Salesforce/codegen-16B-mono \
+    --dataset_name b-mc2/sql-create-context \
+    --bf16 True \
+    --output_dir ./finetuned-models/codegen-finetune-on-sql-create-context-hpu8-lora8-bs4 \
+    --num_train_epochs 5 \
+    --per_device_train_batch_size 4 \
+    --per_device_eval_batch_size 4 \
+    --evaluation_strategy "no" \
+    --save_strategy "no" \
+    --learning_rate 1e-4 \
+    --logging_steps 1 \
+    --dataset_concatenation \
+    --do_train \
+    --use_habana \
+    --use_lazy_mode \
+    --throughput_warmup_steps 3 \
+    --use_hpu_graphs_for_inference \
+    --lora_target_modules "qkv_proj" \
+    --lora_rank 8 \
+    --do_eval \
+    --validation_split_percentage 10 \
+    --use_cache False
+
+```
+
 ## Streaming
 
 To use the streaming dataset mode which can be very useful for large datasets, add `--streaming` with `--max_steps` specified in the command line. This is currently supported by `run_mlm.py` and `run_clm.py`.
diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py
index fa6f0a62ec..a6e88d5ac4 100644
--- a/examples/language-modeling/run_lora_clm.py
+++ b/examples/language-modeling/run_lora_clm.py
@@ -269,6 +269,13 @@ class FinetuneArguments:
     ),
 }
 
+SQL_PROMPT = (
+    "You are a text-to-SQL model. Your job is to answer questions about a database. "
+    "You are given a question and a context regarding one or more tables in the database.\n\n"
+    "You must output the SQL query that answers the question. The SQL query must be between [SQL] and [/SQL] tags.\n\n"
+    "### Question: \n{question}\n\n### Context: \n{context}\n\n### Response:"
+)
+
 
 def create_prompts(examples):
     prompts = {}
@@ -283,6 +290,16 @@
         prompts["target"].append(example["output"])
     return prompts
 
+def create_sql_prompts(examples):
+    prompts = {}
+    prompts["source"] = []
+    prompts["target"] = []
+    for example in examples:
+        source = SQL_PROMPT.format_map(example)
+        prompts["source"].append(source)
+        prompts["target"].append(example["answer"])
+    return prompts
+
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -438,10 +455,18 @@
             use_auth_token=True if model_args.use_auth_token else None,
             **dataset_args,
         )
-    if data_args.dataset_name == "tatsu-lab/alpaca":
+    for key in raw_datasets:
+        # detect the alpaca or sql-create-context schema by its columns when a dataset is passed in as a json file
+        if sorted(raw_datasets[key].features.keys()) == sorted(["input", "output", "instruction"]):
+            data_args.dataset_name = "tatsu-lab/alpaca"
+        if sorted(raw_datasets[key].features.keys()) == sorted(["question", "context", "answer"]):
+            data_args.dataset_name = "b-mc2/sql-create-context"
+
+    if data_args.dataset_name in ["tatsu-lab/alpaca", "b-mc2/sql-create-context"]:
         # Preprocessing the datasets.
+        is_alpaca = data_args.dataset_name == "tatsu-lab/alpaca"
         for key in raw_datasets:
-            prompts = create_prompts(raw_datasets[key])
+            prompts = create_prompts(raw_datasets[key]) if is_alpaca else create_sql_prompts(raw_datasets[key])
             columns_to_be_removed = list(raw_datasets[key].features.keys())
             raw_datasets[key] = raw_datasets[key].add_column("prompt_sources", prompts["source"])
             raw_datasets[key] = raw_datasets[key].add_column("prompt_targets", prompts["target"])
@@ -558,7 +583,7 @@ def concatenate_data(dataset, max_seq_length):
             concatenated_dataset[column] = reshaped_data
         return datasets.Dataset.from_dict(concatenated_dataset)
 
-    if data_args.dataset_name == "tatsu-lab/alpaca":
+    if data_args.dataset_name in ["tatsu-lab/alpaca", "b-mc2/sql-create-context"]:
         tokenized_datasets_ = tokenized_datasets["train"].remove_columns(["prompt_sources", "prompt_targets"])
         if training_args.do_eval:
            tokenized_datasets_eval_ = tokenized_datasets["validation"].remove_columns(
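
For reference, here is a minimal standalone sketch (not part of the patch) of what the new preprocessing produces for a single record; the row below is a hypothetical example following the dataset's `question`/`context`/`answer` schema:

```python
# Mirrors the SQL_PROMPT template and the create_sql_prompts logic added above.
SQL_PROMPT = (
    "You are a text-to-SQL model. Your job is to answer questions about a database. "
    "You are given a question and a context regarding one or more tables in the database.\n\n"
    "You must output the SQL query that answers the question. The SQL query must be between [SQL] and [/SQL] tags.\n\n"
    "### Question: \n{question}\n\n### Context: \n{context}\n\n### Response:"
)

# Hypothetical record with the b-mc2/sql-create-context columns (question, context, answer).
example = {
    "question": "How many heads of the departments are older than 56?",
    "context": "CREATE TABLE head (age INTEGER)",
    "answer": "SELECT COUNT(*) FROM head WHERE age > 56",
}

# format_map ignores the extra "answer" key and fills in {question} and {context}.
source = SQL_PROMPT.format_map(example)  # becomes the "prompt_sources" column
target = example["answer"]               # becomes the "prompt_targets" column
print(source)
print(target)
```

The column-sniffing loop in the third hunk exists so the same preprocessing path is taken when either dataset is supplied as a local json file rather than by hub name, in which case `dataset_name` would otherwise not match either id.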