Skip to content

Commit 16c8fc3

Browse files
committed
Use simple_template_for_pretrain
1 parent 42a4a17 commit 16c8fc3

9 files changed

+10
-22
lines changed

recipes/A5000_24GB_x8/i18n-ml-wikipedia.yaml

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ model_name: tinyllama-ml-wikipedia-1.5T-v0.1
44
output_base_dir: /data/output
55
dataset_id: wikimedia/wikipedia
66
dataset_load_config: 20231101.ml
7-
dataset_input_field_name: title
8-
dataset_output_field_name: text
7+
dataset_input_field_name: text
98
dataset_train_split_seed: 42
109
dataset_train_split_test_size: 0.2
1110
lora_r: 8

recipes/A5000_24GB_x8/i18n-ms-wikipedia.yaml

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ model_name: tinyllama-ms-wikipedia-1.5T-v0.1
44
output_base_dir: /data/output
55
dataset_id: wikimedia/wikipedia
66
dataset_load_config: 20231101.ms
7-
dataset_input_field_name: title
8-
dataset_output_field_name: text
7+
dataset_input_field_name: text
98
dataset_train_split_seed: 42
109
dataset_train_split_test_size: 0.2
1110
lora_r: 8

recipes/A5000_24GB_x8/i18n-rm-wikipedia.yaml

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ model_name: tinyllama-rm-wikipedia-1.5T-v0.1
44
output_base_dir: /data/output
55
dataset_id: wikimedia/wikipedia
66
dataset_load_config: 20231101.rm
7-
dataset_input_field_name: title
8-
dataset_output_field_name: text
7+
dataset_input_field_name: text
98
dataset_train_split_seed: 42
109
dataset_train_split_test_size: 0.2
1110
lora_r: 8

recipes/A5000_24GB_x8/i18n-si-wikipedia.yaml

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ model_name: tinyllama-si-wikipedia-1.5T-v0.1
44
output_base_dir: /data/output
55
dataset_id: wikimedia/wikipedia
66
dataset_load_config: 20231101.si
7-
dataset_input_field_name: title
8-
dataset_output_field_name: text
7+
dataset_input_field_name: text
98
dataset_train_split_seed: 42
109
dataset_train_split_test_size: 0.2
1110
lora_r: 8

recipes/A5000_24GB_x8/i18n-sq-wikipedia.yaml

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ model_name: tinyllama-sq-wikipedia-1.5T-v0.1
44
output_base_dir: /data/output
55
dataset_id: wikimedia/wikipedia
66
dataset_load_config: 20231101.sq
7-
dataset_input_field_name: title
8-
dataset_output_field_name: text
7+
dataset_input_field_name: text
98
dataset_train_split_seed: 42
109
dataset_train_split_test_size: 0.2
1110
lora_r: 8

recipes/A5000_24GB_x8/i18n-sr-wikipedia.yaml

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ model_name: tinyllama-sr-wikipedia-1.5T-v0.1
44
output_base_dir: /data/output
55
dataset_id: wikimedia/wikipedia
66
dataset_load_config: 20231101.sr
7-
dataset_input_field_name: title
8-
dataset_output_field_name: text
7+
dataset_input_field_name: text
98
dataset_train_split_seed: 42
109
dataset_train_split_test_size: 0.2
1110
lora_r: 8

recipes/A5000_24GB_x8/i18n-ta-wikipedia.yaml

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ model_name: tinyllama-ta-wikipedia-1.5T-v0.1
44
output_base_dir: /data/output
55
dataset_id: wikimedia/wikipedia
66
dataset_load_config: 20231101.ta
7-
dataset_input_field_name: title
8-
dataset_output_field_name: text
7+
dataset_input_field_name: text
98
dataset_train_split_seed: 42
109
dataset_train_split_test_size: 0.2
1110
lora_r: 8

recipes/A5000_24GB_x8/i18n-yo-wikipedia.yaml

+1-2
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,7 @@ model_name: tinyllama-yo-wikipedia-1.5T-v0.1
44
output_base_dir: /data/output
55
dataset_id: wikimedia/wikipedia
66
dataset_load_config: 20231101.yo
7-
dataset_input_field_name: title
8-
dataset_output_field_name: text
7+
dataset_input_field_name: text
98
dataset_train_split_seed: 42
109
dataset_train_split_test_size: 0.2
1110
lora_r: 8

src/train.py

+2-6
Original file line number | Diff line number | Diff line change
@@ -35,9 +35,7 @@ def load_yaml(file_path):
3535
def simple_template_for_pretrain(input) -> str:
3636
# inputから、2つ以上連続する改行を除去する
3737
input = "\n".join([line for line in input.splitlines() if line.strip() != ""])
38-
template = f"""\
39-
{input}\
40-
"""
38+
template = input
4139
# Remove any leading whitespace characters from each line in the template.
4240
template = "\n".join([line.lstrip() for line in template.splitlines()])
4341
return template
@@ -127,9 +125,7 @@ def prepare_train_data(dataset_id):
127125
output_field_name = train_config["dataset_output_field_name"]
128126
if "dataset_output_field_values_to_texts" in train_config:
129127
output_field_values_to_texts = train_config["dataset_output_field_values_to_texts"]
130-
data_df[output_field_name] = data_df[output_field_name].apply(
131-
lambda x: output_field_values_to_texts.get(x, x)
132-
)
128+
data_df[output_field_name] = data_df[output_field_name].apply(lambda x: output_field_values_to_texts.get(x, x))
133129
if "dataset_context_field_name" in train_config:
134130
context_field_name = train_config["dataset_context_field_name"]
135131
if "dataset_context_hint" not in train_config:

0 commit comments

Comments
 (0)