@@ -5,14 +5,16 @@
 from transformers import AutoTokenizer
 from pathlib import Path
 
+BASE_DIR = Path(__file__).parent
+
+PYTHON_URL = 'hf://datasets/iamtarun/python_code_instructions_18k_alpaca/data/train-00000-of-00001-8b6e212f3e1ece96.parquet'
+SHAKESPEARE_URL = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+
 def main():
-    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
-    shakespeare_text = requests.get(data_url).text
+    shakespeare_text = requests.get(SHAKESPEARE_URL).text
 
     # Add in some Python code training data so the model learns both Shakespeare and Python
-    df = pd.read_parquet(
-        'hf://datasets/iamtarun/python_code_instructions_18k_alpaca/data/train-00000-of-00001-8b6e212f3e1ece96.parquet'
-    )
+    df = pd.read_parquet(PYTHON_URL)
     python_code = '\n###\n'.join(df['output'].dropna().astype(str))
     python_code = python_code.encode('ascii', 'ignore').decode()  # there are a few non-ASCII characters but I don't want to deal with them
 
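For illustration, here is a minimal sketch of what the `'\n###\n'` join above produces, with a toy DataFrame standing in for the real parquet dataset at `PYTHON_URL`:

```python
import pandas as pd

# Toy stand-in for the parquet at PYTHON_URL; same dropna/astype/join chain
# as the hunk above. None rows are dropped, everything else is stringified.
df = pd.DataFrame({'output': ['print("hi")', None, 'x = 1 + 2']})
python_code = '\n###\n'.join(df['output'].dropna().astype(str))
print(python_code)
# print("hi")
# ###
# x = 1 + 2
```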
@@ -37,8 +39,8 @@ def main():
     # export to bin files
     train_ids = np.array(train_ids, dtype=np.uint16)
     val_ids = np.array(val_ids, dtype=np.uint16)
-    train_ids.tofile(Path(__file__).parent / 'train.bin')
-    val_ids.tofile(Path(__file__).parent / 'val.bin')
+    train_ids.tofile(BASE_DIR / 'train.bin')
+    val_ids.tofile(BASE_DIR / 'val.bin')
 
     # save the meta information as well, to help us encode/decode later
     meta = {
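A sketch of how the `train.bin` written above can be read back; training loops in this style typically memory-map the file rather than loading it eagerly, and the dtype has to match the `np.uint16` used at export time (the relative path here assumes the working directory is `BASE_DIR`):

```python
import numpy as np

# Memory-map the exported token ids; dtype must match the uint16 export.
train_ids = np.memmap('train.bin', dtype=np.uint16, mode='r')
print(len(train_ids), train_ids[:10])
```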
@@ -47,7 +49,7 @@ def main():
         'encode': new_tokenizer.encode,
         'decode': new_tokenizer.decode,
     }
-    with open(Path(__file__).parent / 'meta.pkl', 'wb') as f:
+    with open(BASE_DIR / 'meta.pkl', 'wb') as f:
         pickle.dump(meta, f)
 
 if __name__ == '__main__':
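Finally, a sketch of consuming `meta.pkl` later. The pickle stores the tokenizer's bound `encode`/`decode` methods, so this assumes those unpickle cleanly, which requires `transformers` (and the tokenizer's files) to be available in the loading environment:

```python
import pickle

# Loading re-instantiates the pickled tokenizer methods from meta.pkl.
with open('meta.pkl', 'rb') as f:
    meta = pickle.load(f)

ids = meta['encode']('To be, or not to be')
print(ids)
print(meta['decode'](ids))
```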