@@ -5,14 +5,16 @@
 from transformers import AutoTokenizer
 from pathlib import Path
 
+BASE_DIR = Path(__file__).parent
+
+PYTHON_URL = 'hf://datasets/iamtarun/python_code_instructions_18k_alpaca/data/train-00000-of-00001-8b6e212f3e1ece96.parquet'
+SHAKESPEARE_URL = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
+
 def main():
-    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
-    shakespeare_text = requests.get(data_url).text
+    shakespeare_text = requests.get(SHAKESPEARE_URL).text
 
     # Add in some Python code training data so the model learns both Shakespeare and Python
-    df = pd.read_parquet(
-        'hf://datasets/iamtarun/python_code_instructions_18k_alpaca/data/train-00000-of-00001-8b6e212f3e1ece96.parquet'
-    )
+    df = pd.read_parquet(PYTHON_URL)
     python_code = '\n###\n'.join(df['output'].dropna().astype(str))
     python_code = python_code.encode('ascii', 'ignore').decode()  # there are a few non-ASCII characters but I don't want to deal with them
 
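For illustration, here is a minimal sketch of what the `'\n###\n'` join above produces, with a toy DataFrame standing in for the real parquet dataset at `PYTHON_URL`:

```python
import pandas as pd

# Toy stand-in for the parquet at PYTHON_URL; same dropna/astype/join chain
# as the hunk above. None rows are dropped, everything else is stringified.
df = pd.DataFrame({'output': ['print("hi")', None, 'x = 1 + 2']})
python_code = '\n###\n'.join(df['output'].dropna().astype(str))
print(python_code)
# print("hi")
# ###
# x = 1 + 2
```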
@@ -37,8 +39,8 @@ def main():
     # export to bin files
     train_ids = np.array(train_ids, dtype=np.uint16)
     val_ids = np.array(val_ids, dtype=np.uint16)
-    train_ids.tofile(Path(__file__).parent / 'train.bin')
-    val_ids.tofile(Path(__file__).parent / 'val.bin')
+    train_ids.tofile(BASE_DIR / 'train.bin')
+    val_ids.tofile(BASE_DIR / 'val.bin')
 
     # save the meta information as well, to help us encode/decode later
     meta = {
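A sketch of how the `train.bin` written above can be read back; training loops in this style typically memory-map the file rather than loading it eagerly, and the dtype has to match the `np.uint16` used at export time (the relative path here assumes the working directory is `BASE_DIR`):

```python
import numpy as np

# Memory-map the exported token ids; dtype must match the uint16 export.
train_ids = np.memmap('train.bin', dtype=np.uint16, mode='r')
print(len(train_ids), train_ids[:10])
```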
@@ -47,7 +49,7 @@ def main():
         'encode': new_tokenizer.encode,
         'decode': new_tokenizer.decode,
     }
-    with open(Path(__file__).parent / 'meta.pkl', 'wb') as f:
+    with open(BASE_DIR / 'meta.pkl', 'wb') as f:
         pickle.dump(meta, f)
 
 if __name__ == '__main__':
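Finally, a sketch of consuming `meta.pkl` later. The pickle stores the tokenizer's bound `encode`/`decode` methods, so this assumes those unpickle cleanly, which requires `transformers` (and the tokenizer's files) to be available in the loading environment:

```python
import pickle

# Loading re-instantiates the pickled tokenizer methods from meta.pkl.
with open('meta.pkl', 'rb') as f:
    meta = pickle.load(f)

ids = meta['encode']('To be, or not to be')
print(ids)
print(meta['decode'](ids))
```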