
Commit 648bf48

Add llama-3 instructions to readme (#79)
* Add llama-3 instructions to readme
* Use the same sharding for llama-2 and llama-3
1 parent: 51647b3

File tree

* README.md
* default_shardings/llama-2.yaml (renamed to default_shardings/llama.yaml)
* jetstream_pt/engine.py

3 files changed: +17 −12 lines


README.md

Lines changed: 13 additions & 7 deletions
@@ -46,7 +46,9 @@ NOTE: the above script will export PYTHONPATH, so sourcing will make it to take
 ## LLaMA
 ### Get official llama weights from meta-llama
 
-Following instructions here: https://github.com/meta-llama/llama#download
+Following instructions here:
+* Llama-2: https://github.com/meta-llama/llama#download
+* Llama-3: https://github.com/meta-llama/llama3/#download
 
 After you have downloaded the weights, it will also download a `tokenizer.model` file that is
 the tokenizer that we will use.
@@ -68,7 +70,7 @@ Need to manually modify the `config.json` in the checkpoint folder to make it a
 export input_ckpt_dir=Original llama weights directory
 export output_ckpt_dir=The output directory
 export quantize=True #whether to quantize
-export model_name="llama-2" # or "gemma"
+export model_name="llama-3" # or "llama-2", "gemma"
 python -m convert_checkpoints --model_name=$model_name --input_checkpoint_dir=$input_ckpt_dir --output_checkpoint_dir=$output_ckpt_dir --quantize=$quantize
 ```

@@ -80,16 +82,20 @@ Set tokenizer path
 export tokenizer_path=tokenizer model file path
 ```
 
-## Llama 7b
+## Llama-2 7b
 ```bash
-python run_interactive.py --size=7b --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
+python run_interactive.py --size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
 ```
 
-## Llama 13b
+## Llama-2 13b
 ```bash
-python run_interactive.py --size=13b --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
+python run_interactive.py --size=13b --model_name=$model_name --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
 ```
 
+## Llama-3 8b
+```bash
+python run_interactive.py --size=8b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
+```
 
 ## Gemma 7b
 ```bash
@@ -101,7 +107,7 @@ python run_interactive.py --model_name=$model_name --size=7b --batch_size=64 --m
 NOTE: the `--platform=tpu=8` need to specify number of tpu devices (which is 4 for v4-8 and 8 for v5light-8`)
 
 ```bash
-python run_server.py --param_size=7b --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --platform=tpu=8 --model=$model_name
+python run_server.py --param_size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --platform=tpu=8 --model=$model_name
 ```
 Now you can fire gRPC to it
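If you are unsure which device count to pass to `--platform`, one quick check is to ask JAX directly. This is only a convenience sketch, assuming JAX is already installed on the TPU VM; it is not a command taken from this repository:

```python
# Prints the number of attached TPU devices,
# e.g. 4 on a v4-8 or 8 on a v5light-8.
import jax

print(jax.device_count())
```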

default_shardings/llama-2.yaml renamed to default_shardings/llama.yaml

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 
 
 freqs_cis : -1 # torch.complex64 (2048, 64)
-tok_embeddings.weight : 1 # torch.float32 (32000, 4096)
+tok_embeddings.weight : 1 # torch.float32 (vocab_size, 4096)
 tok_embeddings.weight_scaler : 0 # torch.bfloat16 (4096,)
 layers.*.attention.wo.weight : 1 # torch.int8 (4096, 4096)
 layers.*.attention.wo.weight_scaler : 0 # torch.bfloat16 (4096,)
@@ -24,5 +24,5 @@ layers.*.feed_forward.w3.weight_scaler : 0 # torch.bfloat16 (4096,)
 layers.*.attention_norm.weight : -1 # torch.float32 (4096,)
 layers.*.ffn_norm.weight : -1 # torch.float32 (4096,)
 norm.weight : -1 # torch.float32 (4096,)
-output.weight : 0 # torch.float32 (32000, 4096)
+output.weight : 0 # torch.float32 (vocab_size, 4096)
 output.weight_scaler : 0 # torch.float32 (4096,)
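For readers new to these sharding files: each key is a glob pattern over parameter names and each value is the tensor dimension to shard across devices, with -1 meaning the tensor is replicated. The rename works because only the shape comments differed between Llama-2 and Llama-3 (their vocabulary sizes differ); the axes themselves are identical, which is what the commit message means by "use the same sharding". Below is a minimal sketch of how such a file could be interpreted; it assumes PyYAML is available and is not the loader jetstream_pt actually uses:

```python
# Sketch only: resolve the shard axis for a parameter name from a sharding
# YAML like default_shardings/llama.yaml (-1 means "replicate").
import fnmatch

import yaml  # assumes PyYAML is installed


def shard_axis(param_name, rules):
  """Return the shard dimension for `param_name`, or -1 for replicated."""
  for pattern, axis in rules.items():
    if fnmatch.fnmatch(param_name, pattern):
      return axis
  raise KeyError(f"no sharding rule matches {param_name}")


with open("default_shardings/llama.yaml") as f:
  rules = yaml.safe_load(f)

print(shard_axis("layers.0.attention.wo.weight", rules))  # 1  -> shard dim 1
print(shard_axis("norm.weight", rules))                   # -1 -> replicated
```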

jetstream_pt/engine.py

Lines changed: 2 additions & 3 deletions
@@ -701,7 +701,6 @@ def create_pytorch_engine(
     checkpoint_format = "safetensors"
     checkpoint_path = paths[0]
 
-  tokenizer = token_utils.load_vocab(tokenizer_path)
   pt_model = None
 
   if not sharding_config:
@@ -734,7 +733,7 @@
         max_cache_length,
         args.dim // args.n_heads,
     )
-    env_data.model_type = "llama-2-" + param_size
+    env_data.model_type = model_name + "-" + param_size
     env_data.num_layers = args.n_layers
     env = JetEngineEnvironment(env_data)
     pt_model = model_exportable.Transformer(args, env)
@@ -746,7 +745,7 @@
         max_cache_length,
         args.head_dim,
     )
-    env_data.model_type = "gemma-" + param_size
+    env_data.model_type = model_name + "-" + param_size
     env_data.num_layers = args.num_hidden_layers
     env = JetEngineEnvironment(env_data)
     pt_model = gemma_model.GemmaModel(args, env)
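Net effect of the engine.py changes: the standalone `token_utils.load_vocab` call is removed, and the model-type tag is derived from the `model_name` flag instead of being hard-coded per model family. A tiny illustration (plain Python, names taken from the diff):

```python
# The new assignment `env_data.model_type = model_name + "-" + param_size`
# yields the same tags as before for llama-2 and gemma, and now also covers
# llama-3 without a separate branch.
for model_name, param_size in [("llama-2", "7b"), ("llama-3", "8b"), ("gemma", "7b")]:
  print(model_name + "-" + param_size)  # llama-2-7b, llama-3-8b, gemma-7b
```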
