3 files changed: +23 −2

@@ -109,7 +109,17 @@ NOTE: the `--platform=tpu=8` need to specify number of tpu devices (which is 4 f
 ```bash
 python run_server.py --param_size=7b --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --platform=tpu=8 --model=$model_name
 ```
-Now you can fire gRPC to it
+
+Now you can send gRPC requests to it.
+
+Optional flags:
+* `--shard_on_batch=1` Shards the model on
+  the batch dimension, i.e. it runs in data-parallel mode instead of model-parallel
+  mode, ignoring the sharding config. This is recommended for the Gemma 2B
+  model, because Gemma 2B is small enough to fit on a single TPU chip.
+
+* `--sharding_config=<path>` Uses an alternative sharding config instead of
+  the ones in the `default_shardings` directory.

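As an illustrative sketch only (not part of this change): a Gemma 2B launch in data-parallel mode would reuse the command above and add `--shard_on_batch=1`. Here `--param_size=2b` and `--batch_size=512` are assumed values (the batch size mirrors the Gemma 2B benchmark table below), and the `$...` variables are assumed to point at a Gemma 2B checkpoint and tokenizer.

```bash
# Illustrative only: --param_size=2b and --batch_size=512 are assumed values;
# --shard_on_batch=1 is the flag documented above.
python run_server.py --param_size=2b --model_name=$model_name --batch_size=512 \
  --max_cache_length=2048 --quantize_weights=$quantize --quantize_kv_cache=$quantize \
  --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path \
  --platform=tpu=8 --model=$model_name --shard_on_batch=1
```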
 # Run benchmark
 go to the deps/JetStream folder (downloaded during `install_everything.sh`)

@@ -16,6 +16,17 @@ Date | Device | dtype | batch size | cache length |max input length |max output
 2024-05-10 | TPU v5e-8 | bfloat16 | 96 | 2048 | 1024 | 1024 | 3236
 2024-05-10 | TPU v5e-8 | int8 | 128 | 2048 | 1024 | 1024 | 4695

+## Gemma - 2B
+
+Date | Device | dtype | batch size | cache length |max input length |max output length| throughput (token/s)
+----| ------- | ------ |---------- | -------------|-----------------|------------------|----------------------
+2024-05-14 | TPU v5e-8 | bfloat16 | 512 | 2048 | 1024 | 1024 | 8700
+2024-05-14 | TPU v5e-8 | int8 | 1024 | 2048 | 1024 | 1024 | 8746
+
+**NOTE:** Gemma 2B uses the `--shard_on_batch` flag, so it runs data-parallel
+instead of model-parallel.
+
 ## Llama 2 - 7B

 Date | Device | dtype | batch size | cache length |max input length |max output length| throughput (token/s)

@@ -176,7 +176,7 @@ def make_caches_generate(self):
   def sharding_by_name(self, name):
     """Create sharding specified in the config."""
     if self.shard_on_batch:
-      return self.shading_by_axis(0)  # batch dimension
+      return self.sharding_by_axis(0)  # batch dimension

     if name in self._sharding_config:
       return self.sharding_by_axis(self._sharding_config[name])
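For background (not taken from this repository's code): a minimal JAX sketch of what sharding along axis 0, the batch dimension, means on a 1-D device mesh. The repository's `sharding_by_axis` helper may be implemented differently; this only illustrates the data-parallel layout that `--shard_on_batch` selects.

```python
import jax
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Illustrative sketch, not the repository's sharding_by_axis implementation.
# 1-D mesh over all available devices (e.g. 8 TPU chips).
mesh = Mesh(np.array(jax.devices()), axis_names=("x",))

# Shard axis 0 (the batch dimension) across the mesh; other axes stay replicated.
batch_sharding = NamedSharding(mesh, P("x", None))

x = jax.numpy.zeros((128, 2048))       # (batch, sequence) sized buffer
x = jax.device_put(x, batch_sharding)  # each device holds a slice of the batch
print(x.sharding)
```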