Skip to content

Commit 701a057

Browse files
committed
, trust_remote_code=True
1 parent 83aac05 commit 701a057

File tree

2 files changed

+45
-14
lines changed

2 files changed

+45
-14
lines changed
+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
target_task: tasks/i18n/fr.md
2+
base_model_id: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
3+
model_name: tinyllama-en-wikipedia-aya-1.5T-v0.1
4+
output_base_dir: /data/output
5+
dataset_id: CohereForAI/aya_dataset
6+
dataset_input_field_name: inputs
7+
dataset_output_field_name: targets
8+
dataset_filter_field_name: language_code
9+
dataset_filter_field_value: eng
10+
dataset_train_split_seed: 42
11+
dataset_train_split_test_size: 0.2
12+
lora_r: 8
13+
lora_alpha: 16
14+
lora_dropout: 0.05
15+
train_claim_gpu_num: 3
16+
train_per_device_train_batch_size: 8
17+
train_gradient_accumulation_steps: 4
18+
train_num_train_epochs: 4
19+
train_max_steps: 1000
20+
train_fp16: True
21+
inference_max_new_tokens: 32
22+
evaluations:
23+
-
24+
prompt: "火縄銃の威力が全国に知られる事となった、1575年に織田・徳川連合軍が鉄砲隊を用いて武田勝頼率いる騎馬隊を破った戦いを何というでしょう?"
25+
expected_output: "長篠の戦いです。"
26+
-
27+
prompt: "ベトナム戦争終結や米ソ戦略兵器削減交渉などを進めたものの、1974年にウォーターゲート事件の責任をとって辞任したアメリカの第37代大統領は誰でしょう?"
28+
expected_output: "リチャード・ニクソンです。"
29+
-
30+
prompt: "格闘家ボブ・サップの出身国はどこでしょう?"
31+
expected_output: "アメリカです。"

src/dataset/load.py

+14-14
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,19 @@
44
from datasets.load import load_dataset
55

66
# load_dataset("oscar")
7-
load_dataset("cc100", "en")
8-
load_dataset("cc100", "ja")
9-
load_dataset("cerebras/SlimPajama-627B")
10-
load_dataset("bigcode/starcoderdata")
11-
load_dataset("Open-Orca/OpenOrca")
12-
load_dataset("HuggingFaceH4/ultrafeedback_binarized")
13-
load_dataset("HuggingFaceH4/ultrachat_200k")
14-
load_dataset("cognitivecomputations/dolphin")
15-
load_dataset("LDJnr/Capybara")
16-
load_dataset("ise-uiuc/Magicoder-Evol-Instruct-110K")
17-
load_dataset("allenai/c4", "en")
18-
load_dataset("allenai/c4", "ja")
19-
load_dataset("the_pile", "all")
7+
load_dataset("cc100", "en", trust_remote_code=True)
8+
load_dataset("cc100", "ja", trust_remote_code=True)
9+
load_dataset("cerebras/SlimPajama-627B", trust_remote_code=True)
10+
load_dataset("bigcode/starcoderdata", trust_remote_code=True)
11+
load_dataset("Open-Orca/OpenOrca", trust_remote_code=True)
12+
load_dataset("HuggingFaceH4/ultrafeedback_binarized", trust_remote_code=True)
13+
load_dataset("HuggingFaceH4/ultrachat_200k", trust_remote_code=True)
14+
load_dataset("cognitivecomputations/dolphin", trust_remote_code=True)
15+
load_dataset("LDJnr/Capybara", trust_remote_code=True)
16+
load_dataset("ise-uiuc/Magicoder-Evol-Instruct-110K", trust_remote_code=True)
17+
load_dataset("allenai/c4", "en", trust_remote_code=True)
18+
load_dataset("allenai/c4", "ja", trust_remote_code=True)
19+
load_dataset("the_pile", "all", trust_remote_code=True)
2020

2121

2222
# Load a YAML file from the specified file path
@@ -43,7 +43,7 @@ def load_yaml(file_path):
4343
is_lte_10gb_dataset = data["converted_size"][-2:] == "GB" and float(data["converted_size"][:-2]) <= 10
4444
# Only load the dataset when its size is in the MB range or at most 10 GB
4545
if is_mb_dataset or is_lte_10gb_dataset:
46-
dataset = load_dataset(data["id"])
46+
dataset = load_dataset(data["id"], trust_remote_code=True)
4747
print(dataset)
4848
except Exception as e:
4949
print(f"Error loading dataset: {e}")

0 commit comments

Comments
 (0)