@@ -46,7 +46,9 @@ NOTE: the above script will export PYTHONPATH, so sourcing it will make it take effect
## LLaMA

### Get official llama weights from meta-llama

- Following instructions here: https://github.com/meta-llama/llama#download
+ Follow the instructions here:
+ * Llama-2: https://github.com/meta-llama/llama#download
+ * Llama-3: https://github.com/meta-llama/llama3/#download

After you have downloaded the weights, the download will also include a `tokenizer.model` file;
this is the tokenizer that we will use.
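As a concrete sketch of the Llama-2 flow (assuming the `download.sh` script from the meta-llama/llama repo and the presigned URL Meta emails after approval; the Llama-3 repo follows the same pattern):

```bash
# Clone Meta's llama repo, which ships the download script.
git clone https://github.com/meta-llama/llama.git
cd llama
# download.sh prompts for the presigned URL from Meta's approval email
# and for which model sizes to fetch; it also downloads tokenizer.model.
./download.sh
```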
@@ -68,7 +70,7 @@ Need to manually modify the `config.json` in the checkpoint folder to make it a
export input_ckpt_dir=Original llama weights directory
export output_ckpt_dir=The output directory
export quantize=True # whether to quantize
- export model_name="llama-2" # or "gemma"
+ export model_name="llama-3" # or "llama-2", "gemma"

python -m convert_checkpoints --model_name=$model_name --input_checkpoint_dir=$input_ckpt_dir --output_checkpoint_dir=$output_ckpt_dir --quantize=$quantize
```
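For example, a quantized Llama-3 conversion might look like this (the directories below are illustrative, not prescribed by the repo):

```bash
# Illustrative paths -- substitute your own directories.
export input_ckpt_dir=$HOME/llama3/Meta-Llama-3-8B
export output_ckpt_dir=$HOME/llama3/converted
export quantize=True
export model_name="llama-3"

python -m convert_checkpoints --model_name=$model_name --input_checkpoint_dir=$input_ckpt_dir --output_checkpoint_dir=$output_ckpt_dir --quantize=$quantize
```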
@@ -80,16 +82,20 @@ Set tokenizer path
export tokenizer_path=tokenizer model file path
```
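For example, if the Meta download left `tokenizer.model` alongside the weights (the exact layout varies by release, so check where the download step put it):

```bash
# Illustrative -- point this at the tokenizer.model from the download step.
export tokenizer_path=$input_ckpt_dir/tokenizer.model
```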
- ## Llama 7b
+ ## Llama-2 7b
```bash
- python run_interactive.py --size=7b --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
+ python run_interactive.py --size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
```

- ## Llama 13b
+ ## Llama-2 13b
```bash
- python run_interactive.py --size=13b --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
+ python run_interactive.py --size=13b --model_name=$model_name --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
```

+ ## Llama-3 8b
+ ```bash
+ python run_interactive.py --size=8b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/llama.yaml
+ ```

## Gemma 7b
```bash
@@ -101,7 +107,7 @@ python run_interactive.py --model_name=$model_name --size=7b --batch_size=64 --m
NOTE: the `--platform=tpu=8` flag needs to specify the number of TPU devices (which is 4 for v4-8 and 8 for v5light-8)

```bash
- python run_server.py --param_size=7b --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --platform=tpu=8 --model=$model_name
+ python run_server.py --param_size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --platform=tpu=8 --model=$model_name
```
Now you can send gRPC requests to it.
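For instance, a hypothetical smoke test with `grpcurl` (the port, service path, method name, and request fields below are all assumptions, and this also assumes the server enables gRPC reflection; check the JetStream `.proto` files shipped with your version for the real interface):

```bash
# Hypothetical request sketch -- the port, service/method names, and
# request fields are assumptions; consult the JetStream proto
# definitions for the actual interface.
grpcurl -plaintext \
  -d '{"additional_text": "What is the capital of France?", "max_tokens": 64}' \
  localhost:9000 jetstream.Orchestrator/Decode
```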