Commit 86bf7e0

WIP for YuisekinAI
committed, 1 parent 81de8a1, commit 86bf7e0

7 files changed: +114 -16 lines changed

Makefile (+2 -2)

@@ -18,10 +18,10 @@ eval-all: $(targets)
 	python3 src/eval.py recipes/RTX_3060_12GB/sql-coder.yaml
 
 output/tinyllama-color-coder-v1/checkpoint-200/README.md:
-	python3 src/train.py recipes/RTX_3060_12GB/color-coder.yaml
+	accelerate launch src/train.py recipes/RTX_3060_12GB/color-coder.yaml
 
 output/tinyllama-sql-coder-v1/checkpoint-200/README.md:
-	python3 src/train.py recipes/RTX_3060_12GB/sql-coder.yaml
+	accelerate launch src/train.py recipes/RTX_3060_12GB/sql-coder.yaml
 
 docker:
 	docker build --no-cache -t yuiseki/infinite-tinyllama:latest .

README.md (+4)

@@ -40,6 +40,10 @@ VRAM 24GB で日本語データセットでファインチューニングしよ
 conda create -n peft
 ```
 
+```bash
+conda install -c nvidia cuda-toolkit=12.1
+```
+
 ```bash
 conda activate peft
 ```
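The new conda step installs the CUDA 12.1 toolkit inside the environment, presumably so that flash_attn (added to requirements.txt further down) can compile against it. As a quick illustrative check, not part of this commit, one might confirm that the local PyTorch build matches the toolkit:

```python
# Illustrative sanity check (not part of the commit): confirm the CUDA version
# PyTorch was built against before building flash_attn with the new toolkit.
import torch

print(torch.version.cuda)         # expected to report something like "12.1"
print(torch.cuda.is_available())  # True if a usable GPU and driver are present
```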
New file: recipe YAML (+30)

@@ -0,0 +1,30 @@
+target_task: tasks/i18n/ja.md
+base_model_id: yuiseki/YuisekinAI-mistral-1.1B
+model_name: YuisekinAI-mistral-1.1B-aya
+output_base_dir: /data/output
+dataset_id: CohereForAI/aya_dataset
+dataset_input_field_name: inputs
+dataset_output_field_name: targets
+dataset_filter_field_name: language_code
+dataset_filter_field_value: jpn
+dataset_train_split_seed: 42
+dataset_train_split_test_size: 0.2
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+train_claim_gpu_num: 1
+train_per_device_train_batch_size: 4
+train_gradient_accumulation_steps: 32
+train_num_train_epochs: 4
+train_fp16: True
+inference_max_new_tokens: 32
+evaluations:
+  -
+    prompt: "火縄銃の威力が全国に知られる事となった、1575年に織田・徳川連合軍が鉄砲隊を用いて武田勝頼率いる騎馬隊を破った戦いを何というでしょう?"
+    expected_output: "長篠の戦いです。"
+  -
+    prompt: "ベトナム戦争終結や米ソ戦略兵器削減交渉などを進めたものの、1974年にウォーターゲート事件の責任をとって辞任したアメリカの第37代大統領は誰でしょう?"
+    expected_output: "リチャード・ニクソンです。"
+  -
+    prompt: "格闘家ボブ・サップの出身国はどこでしょう?"
+    expected_output: "アメリカです。"
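This recipe, like the two wikibook recipes that follow, uses the schema that src/train.py reads: the dataset_* fields name the dataset, the prompt and response columns, a row filter, and the train/test split. A rough sketch of how those fields might be consumed, with values copied from the recipe above and the loading code itself only an assumption, not the repo's exact implementation:

```python
# Illustrative sketch of how the dataset_* recipe fields could drive data prep.
from datasets import load_dataset

# dataset_id
dataset = load_dataset("CohereForAI/aya_dataset", split="train")
# dataset_filter_field_name / dataset_filter_field_value: keep only Japanese rows.
dataset = dataset.filter(lambda row: row["language_code"] == "jpn")
# dataset_train_split_seed / dataset_train_split_test_size
splits = dataset.train_test_split(test_size=0.2, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]
# dataset_input_field_name / dataset_output_field_name
print(train_ds[0]["inputs"], train_ds[0]["targets"])
```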
New file: recipe YAML (+28)

@@ -0,0 +1,28 @@
+target_task: tasks/i18n/ja.md
+base_model_id: TinyLlama/TinyLlama-1.1B-intermediate-step-715k-1.5T
+model_name: tinyllama-ja-wikibook-hs-v0.1
+output_base_dir: /data/output
+dataset_id: DataPilot/wikibook_High_School_textbooks
+dataset_input_field_name: text
+dataset_train_split_seed: 42
+dataset_train_split_test_size: 0.2
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+train_claim_gpu_num: 4
+train_per_device_train_batch_size: 8
+train_gradient_accumulation_steps: 4
+train_num_train_epochs: 4
+train_max_steps: 2000
+train_fp16: True
+inference_max_new_tokens: 32
+evaluations:
+  -
+    prompt: "2つ以上の文字を組み合わせて図案化したもののことで、特にルイ・ヴィトンのものが知られるのは何でしょう?"
+    expected_output: "モノグラム"
+  -
+    prompt: "幾つかの布切れを縫いあわせ、飾りや模様を作る手芸方法を何というでしょう?"
+    expected_output: "パッチワーク"
+  -
+    prompt: "格闘家ボブ・サップの出身国はどこでしょう?"
+    expected_output: "アメリカ"
New file: recipe YAML (+28)

@@ -0,0 +1,28 @@
+target_task: tasks/i18n/ja.md
+base_model_id: TinyLlama/TinyLlama-1.1B-intermediate-step-715k-1.5T
+model_name: tinyllama-ja-wikibook-jhs-v0.1
+output_base_dir: /data/output
+dataset_id: DataPilot/wikibook_Junior_High_School_textbooks_ja
+dataset_input_field_name: text
+dataset_train_split_seed: 42
+dataset_train_split_test_size: 0.2
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+train_claim_gpu_num: 4
+train_per_device_train_batch_size: 8
+train_gradient_accumulation_steps: 4
+train_num_train_epochs: 4
+train_max_steps: 2000
+train_fp16: True
+inference_max_new_tokens: 32
+evaluations:
+  -
+    prompt: "2つ以上の文字を組み合わせて図案化したもののことで、特にルイ・ヴィトンのものが知られるのは何でしょう?"
+    expected_output: "モノグラム"
+  -
+    prompt: "幾つかの布切れを縫いあわせ、飾りや模様を作る手芸方法を何というでしょう?"
+    expected_output: "パッチワーク"
+  -
+    prompt: "格闘家ボブ・サップの出身国はどこでしょう?"
+    expected_output: "アメリカ"

requirements.txt (+1)

@@ -4,5 +4,6 @@ bitsandbytes
 transformers
 trl
 wandb
+flash_attn
 
 types-PyYAML

src/train.py (+21 -14)

@@ -1,8 +1,9 @@
 import os
 import sys
 
-import wandb
+import torch
 import yaml
+from accelerate import PartialState
 from datasets.arrow_dataset import Dataset
 from datasets.load import load_dataset
 from peft import LoraConfig
@@ -14,6 +15,8 @@
 )
 from trl import SFTTrainer
 
+import wandb
+
 os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
 
 
@@ -176,24 +179,35 @@ def load_model_and_tokenizer(model_id):
     # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
     # NOTE: tokenizer.add_special_tokensやるならこれは不要
     tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
 
     # Define the quantization configuration for memory-efficient training.
     bnb_config = BitsAndBytesConfig(
         # Load the model weights in 4-bit quantized format.
         load_in_4bit=True,
+        # Specify whether to use double quantization for 4-bit quantization.
+        bnb_4bit_use_double_quant=True,
         # Specify the quantization type to use for 4-bit quantization.
         bnb_4bit_quant_type="nf4",
         # Specify the data type to use for computations during training.
-        bnb_4bit_compute_dtype="float16",
-        # Specify whether to use double quantization for 4-bit quantization.
-        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.float16,
     )
     # Load the model from the specified model ID and apply the quantization configuration.
+
     model = AutoModelForCausalLM.from_pretrained(
+        # Base model id
         model_id,
+        # BitsAndBytes configuration
         quantization_config=bnb_config,
+        # Set torch dtype
+        torch_dtype=torch.float16,
+        # Trust remote code
         trust_remote_code=True,
-        device_map="auto",
+        # Pin the whole model to this process's GPU (replaces device_map="auto")
+        # device_map="auto",
+        device_map={"": PartialState().process_index},
+        # Set the attention implementation
+        attn_implementation="flash_attention_2",
     )
     # Disable cache to improve training speed.
     model.config.use_cache = False
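The device_map change is what the Makefile's move to `accelerate launch` depends on: instead of letting device_map="auto" shard a single model across GPUs, each launched process loads its own full (4-bit) copy of the model on the GPU matching its rank. A minimal illustrative sketch of that mapping, assuming it runs under `accelerate launch` (not code from this commit):

```python
# Illustrative sketch: how PartialState maps each launched process to one GPU.
from accelerate import PartialState

state = PartialState()                  # initialized once per launched process
device_map = {"": state.process_index}  # e.g. {"": 0}, {"": 1}, ... one GPU each
print(f"process {state.process_index} of {state.num_processes} "
      f"-> cuda:{state.process_index}")
```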
@@ -222,12 +236,6 @@ def load_model_and_tokenizer(model_id):
 os.environ["WANDB_PROJECT"] = "infinite-tinyllama"
 os.environ["WANDB_LOG_MODEL"] = "false"
 os.environ["WANDB_WATCH"] = "all"
-wandb.init(
-    project="infinite-tinyllama",
-    name=train_config["model_name"],
-    group=train_config["model_name"],
-    config=train_config,
-)
 
 #
 # Define LoRA and PEFT config
@@ -249,12 +257,11 @@ def load_model_and_tokenizer(model_id):
     optim="paged_adamw_32bit",
     learning_rate=2e-4,
     lr_scheduler_type="cosine",
-    save_strategy="steps",
-    save_steps=100,
+    save_strategy="epoch",
     logging_steps=10,
     num_train_epochs=int(train_config["train_num_train_epochs"]),
-    max_steps=int(train_config["train_max_steps"]),
     fp16=True,
+    run_name=train_config["model_name"],
 )
 
 trainer = SFTTrainer(
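With the explicit wandb.init() call removed, run metadata now flows through the trainer instead: transformers' wandb integration reads WANDB_PROJECT from the environment, and the new run_name argument names the run. An illustrative sketch of the equivalent configuration; output_dir and report_to below are placeholders, not values taken from this repo:

```python
# Illustrative sketch: configuring the wandb run without calling wandb.init().
import os
from transformers import TrainingArguments

os.environ["WANDB_PROJECT"] = "infinite-tinyllama"  # read by the wandb callback

args = TrainingArguments(
    output_dir="output/example-run",         # hypothetical path for illustration
    run_name="YuisekinAI-mistral-1.1B-aya",  # model_name from the recipe above
    report_to=["wandb"],                     # make the wandb integration explicit
    logging_steps=10,
)
```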
