|
3 | 3 | import yaml
|
4 | 4 | from datasets.load import load_dataset
|
5 | 5 |
|
6 |
| -# load_dataset("oscar") |
7 |
| -load_dataset("cc100", "en", trust_remote_code=True) |
8 |
| -load_dataset("cc100", "ja", trust_remote_code=True) |
9 |
| -load_dataset("cerebras/SlimPajama-627B", trust_remote_code=True) |
10 |
| -load_dataset("bigcode/starcoderdata", trust_remote_code=True) |
11 |
| -load_dataset("Open-Orca/OpenOrca", trust_remote_code=True) |
| 6 | +# Used by TinyLlama
12 | 7 | load_dataset("HuggingFaceH4/ultrafeedback_binarized", trust_remote_code=True)
|
| 8 | +# Used by TinyLlama
13 | 9 | load_dataset("HuggingFaceH4/ultrachat_200k", trust_remote_code=True)
|
14 |
| -load_dataset("cognitivecomputations/dolphin", trust_remote_code=True) |
15 |
| -load_dataset("LDJnr/Capybara", trust_remote_code=True) |
| 10 | +# Used by dolphin
16 | 11 | load_dataset("ise-uiuc/Magicoder-Evol-Instruct-110K", trust_remote_code=True)
|
| 12 | +# Multi-turn dialogue
| 13 | +load_dataset("LDJnr/Capybara", trust_remote_code=True) |
| 14 | + |
| 15 | +# Gigabyte-scale
| 16 | +load_dataset("cognitivecomputations/dolphin", trust_remote_code=True) |
| 17 | +load_dataset("Open-Orca/OpenOrca", trust_remote_code=True) |
| 18 | + |
| 19 | +# Large
| 20 | +load_dataset("cc100", "en", trust_remote_code=True) |
| 21 | +load_dataset("cc100", "ja", trust_remote_code=True) |
17 | 22 | load_dataset("allenai/c4", "en", trust_remote_code=True)
|
18 | 23 | load_dataset("allenai/c4", "ja", trust_remote_code=True)
|
19 |
| -load_dataset("the_pile", "all", trust_remote_code=True) |
| 24 | + |
| 25 | +# Too large
| 26 | +# Used by TinyLlama, 895 GB
| 27 | +# load_dataset("cerebras/SlimPajama-627B", trust_remote_code=True) |
| 28 | +# Used by TinyLlama, 311 GB
| 29 | +# load_dataset("bigcode/starcoderdata", trust_remote_code=True) |
| 30 | +# 825 GB |
| 31 | +# load_dataset("EleutherAI/pile", "all", trust_remote_code=True) |
| 32 | +# load_dataset("oscar") |
20 | 33 |
|
21 | 34 |
|
22 | 35 | # Load a YAML file from the specified file path
|
|
0 commit comments