@@ -113,9 +113,11 @@ def prepare_train_data(dataset_id):
     if "dataset_load_config" in train_config:
         dataset_load_config = train_config["dataset_load_config"]
         data = load_dataset(dataset_id, dataset_load_config, split="train", num_proc=32)
-        if dataset_load_config == "20231101.ja" or dataset_load_config == "20231101.vi" or dataset_load_config == "20231101.es":
+        if dataset_load_config in ("20231101.ja", "20231101.vi", "20231101.es", "20231101.it"):
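+            # keep every 3rd example, presumably to cap these larger language configs at ~1/3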
             data = data.filter(lambda item, idx: idx % 3 == 0, with_indices=True)
-        if dataset_load_config == "20231101.de":
+        if dataset_load_config in ("20231101.de", "20231101.fr"):
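+            # keep every 5th example (20%), a heavier cut for these configs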
             data = data.filter(lambda item, idx: idx % 5 == 0, with_indices=True)
     else:
         data = load_dataset(dataset_id, split="train", num_proc=32)
@@ -162,7 +162,8 @@ def prepare_train_data(dataset_id):
         lambda x: simple_template_for_train(x[input_field_name], x[output_field_name]),
         axis=1,
     )
-
+    # keep only the generated "text" column
+    data_df = data_df[["text"]]
     data = Dataset.from_pandas(data_df)
     data = data.train_test_split(seed=42, test_size=0.2)
     print(len(data["train"]))
@@ -281,7 +282,8 @@ def load_model_and_tokenizer(model_id):
         args=training_arguments,
         tokenizer=tokenizer,
         packing=False,
-        max_seq_length=512,
+        max_seq_length=1024,
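+        # note: a longer max_seq_length increases memory use and training time per batch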
     )
 
 #