Skip to content

Commit f87036a

Browse files
committed
Update src/dataset/load.py
1 parent 701a057 commit f87036a

File tree

1 file changed

+22
-9
lines changed

1 file changed

+22
-9
lines changed

src/dataset/load.py

+22 -9
Original file line number | Diff line number | Diff line change
@@ -3,20 +3,33 @@
33
import yaml
44
from datasets.load import load_dataset
55

6-
# load_dataset("oscar")
7-
load_dataset("cc100", "en", trust_remote_code=True)
8-
load_dataset("cc100", "ja", trust_remote_code=True)
9-
load_dataset("cerebras/SlimPajama-627B", trust_remote_code=True)
10-
load_dataset("bigcode/starcoderdata", trust_remote_code=True)
11-
load_dataset("Open-Orca/OpenOrca", trust_remote_code=True)
6+
# TinyLlamaが使ってる
127
load_dataset("HuggingFaceH4/ultrafeedback_binarized", trust_remote_code=True)
8+
# TinyLlamaが使ってる
139
load_dataset("HuggingFaceH4/ultrachat_200k", trust_remote_code=True)
14-
load_dataset("cognitivecomputations/dolphin", trust_remote_code=True)
15-
load_dataset("LDJnr/Capybara", trust_remote_code=True)
10+
# dolphinが使ってる
1611
load_dataset("ise-uiuc/Magicoder-Evol-Instruct-110K", trust_remote_code=True)
12+
# マルチターン対話
13+
load_dataset("LDJnr/Capybara", trust_remote_code=True)
14+
15+
# GBオーダー
16+
load_dataset("cognitivecomputations/dolphin", trust_remote_code=True)
17+
load_dataset("Open-Orca/OpenOrca", trust_remote_code=True)
18+
19+
# デカい
20+
load_dataset("cc100", "en", trust_remote_code=True)
21+
load_dataset("cc100", "ja", trust_remote_code=True)
1722
load_dataset("allenai/c4", "en", trust_remote_code=True)
1823
load_dataset("allenai/c4", "ja", trust_remote_code=True)
19-
load_dataset("the_pile", "all", trust_remote_code=True)
24+
25+
# デカすぎる
26+
# TinyLlamaが使ってる、895 GB
27+
# load_dataset("cerebras/SlimPajama-627B", trust_remote_code=True)
28+
# TinyLlamaが使ってる、311 GB
29+
# load_dataset("bigcode/starcoderdata", trust_remote_code=True)
30+
# 825 GB
31+
# load_dataset("EleutherAI/pile", "all", trust_remote_code=True)
32+
# load_dataset("oscar")
2033

2134

2235
# 指定されたファイルパスからyamlファイルを読み込む

0 commit comments

Comments
 (0)