Skip to content

Commit ffb234e

Browse files
committed
script to run mlperf llama2 70b on gpu
1 parent deb14a3 commit ffb234e

File tree

2 files changed

+126
-1
lines changed

2 files changed

+126
-1
lines changed
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env bash
#
# Run MLPerf llama2-70b offline benchmarks on 8x H100 GPUs.
#
# Usage:
#   bash benchmarks_llama2-70b-h100_8.sh [-n] [-p] [-t] [-r run_name] [-b benchmark_type]
#
#   benchmark_type: performance | audit | accuracy | all   (default: performance)
#
# Optional environment overrides:
#   KV_QUANT_DTYPE, QUANTIZE_KVCACHE, CHECKPOINT, TOKENIZER_PATH,
#   PREFILL_LENS_AND_PER_DEVICE_BATCH_SIZES, QUANTIZATION, QUANT_MP

# NOTE: -e is deliberately omitted so that "-b all" still attempts the
# remaining benchmark flavors if an earlier one fails.
set -uo pipefail

run_name="h100_llama2-70b"
dry_run=false
enable_profiler=false
test_mode=false
benchmark_type="performance"

helpFunction() {
  echo ""
  echo "Usage: $0 [-n] [-p] [-t] [-r run_name] [-b benchmark_type]"
  echo -e "\t-n Dry run mode"
  echo -e "\t-p Enable profiler"
  echo -e "\t-t Test mode"
  echo -e "\t-r Specify run name"
  echo -e "\t-b Specify benchmark type (performance|audit|accuracy|all)"
  exit 1
}

# Parse arguments with a while-loop over the live positional parameters.
# (The original iterated with `for arg in "$@"` — a snapshot — while also
# calling `shift` inside the loop, so value-taking flags like `-r name`
# consumed their argument unreliably.)
while [[ $# -gt 0 ]]; do
  case "$1" in
    -n) dry_run=true ;;
    -p) enable_profiler=true ;;
    -t) test_mode=true ;;
    -r=*|--run=*) run_name="${1#*=}" ;;
    -r|--run) shift; run_name="${1:?-r requires a value}" ;;
    -b=*|--benchmark=*) benchmark_type="${1#*=}" ;;
    -b|--benchmark) shift; benchmark_type="${1:?-b requires a value}" ;;
    -h|--help) helpFunction ;;
    *) echo "Unknown option: $1" >&2; helpFunction ;;
  esac
  shift
done

# Validate benchmark type.
case "$benchmark_type" in
  performance|audit|accuracy|all) ;;
  *) echo "Invalid benchmark type. Must be: performance, audit, accuracy, or all" >&2; exit 1 ;;
esac

cmd=''   # command prefix (e.g. set to 'echo' to print instead of execute)

RUN_OPTIONS=" -c "   # Enable prefill packing by default
if "$dry_run"; then
  RUN_OPTIONS="${RUN_OPTIONS} -n "
fi

if "$enable_profiler"; then
  RUN_OPTIONS="${RUN_OPTIONS} -p "
fi

if "$test_mode"; then
  RUN_OPTIONS="${RUN_OPTIONS} -t "
fi

export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_command_buffer=FUSION --xla_disable_hlo_passes=rematerialization"
echo "XLA_FLAGS: ${XLA_FLAGS}"

# if [[ -z "${QUANTIZATION:-}" ]]; then
#   export QUANTIZATION="aqt_fp8"
# fi

# Default to an fp8-quantized KV cache.  QUANTIZE_KVCACHE gets its own
# check so it is still exported when the caller supplies KV_QUANT_DTYPE
# (the original only set it when KV_QUANT_DTYPE was unset, which left
# "quantize_kvcache=" empty in that case).
if [[ -z "${KV_QUANT_DTYPE:-}" ]]; then
  export KV_QUANT_DTYPE="fp8"
fi
if [[ -z "${QUANTIZE_KVCACHE:-}" ]]; then
  export QUANTIZE_KVCACHE=True
fi

if [[ -z "${CHECKPOINT:-}" ]]; then
  export CHECKPOINT="gs://jwyang/maxtext/direct_generate_param_only_checkpoint_llama2_70b_chat/checkpoints/0/items"
fi

if [[ -z "${TOKENIZER_PATH:-}" ]]; then
  export TOKENIZER_PATH="/opt/maxtext/assets/tokenizer.llama2"
fi

# Default prefill length / per-device batch size pairing.
if [[ -z "${PREFILL_LENS_AND_PER_DEVICE_BATCH_SIZES:-}" ]]; then
  PREFILL_LEN="1024"
  BATCH_SIZE_PER_DEVICE="160"
  export PREFILL_LENS_AND_PER_DEVICE_BATCH_SIZES="${PREFILL_LEN},${BATCH_SIZE_PER_DEVICE}"
fi

BASE_CFG="model_name=llama2-70b tokenizer_path=${TOKENIZER_PATH} load_parameters_path=${CHECKPOINT} scan_layers=false hardware=gpu async_checkpointing=False ici_tensor_parallelism=-1 weight_dtype=bfloat16"
KV_QUANT_CFG="quantize_kvcache=${QUANTIZE_KVCACHE} kv_quant_dtype=${KV_QUANT_DTYPE}"
export MAXENGINE_ARGS="${BASE_CFG} ${KV_QUANT_CFG} optimize_mesh_for_tpu_v6e=false"
echo
echo "${MAXENGINE_ARGS}"
echo

# PREFILL_LEN/BATCH_SIZE_PER_DEVICE may be unset when the caller provided
# PREFILL_LENS_AND_PER_DEVICE_BATCH_SIZES directly; use :- guards.
RUN_DESC="${run_name}_${PREFILL_LEN:-}_${BATCH_SIZE_PER_DEVICE:-}_quant_${QUANTIZATION:-}_${QUANT_MP:-}_kv_${KV_QUANT_DTYPE}_opt"
# NOTE(review): this path mixes "maxtext" and "Maxtext" casing — confirm
# against the container image layout.
export BASEDIR=/opt/maxtext/Maxtext/inference_mlperf/

$cmd cd ..

# Run one benchmark flavor (performance | audit | accuracy) via
# llama_offline_run.sh with the accumulated RUN_OPTIONS.
run_benchmark() {
  local type=$1
  case "$type" in
    "performance")
      # Original passed "-r -benchmarks_performance_..." (stray leading
      # dash, unlike the accuracy invocation); normalized here.
      $cmd bash llama_offline_run.sh ${RUN_OPTIONS} -r "benchmarks_performance_${RUN_DESC}"
      ;;
    "audit")
      $cmd bash llama_offline_run.sh ${RUN_OPTIONS} -r "benchmarks_audit_${RUN_DESC}" -d
      ;;
    "accuracy")
      # Accuracy evaluation compares against the HuggingFace reference.
      export HF_CKPT="meta-llama/Llama-2-70b-chat-hf"
      $cmd bash llama_offline_run.sh ${RUN_OPTIONS} -r "benchmarks_accuracy_${RUN_DESC}" -a
      ;;
  esac
}

if [[ "$benchmark_type" == "all" ]]; then
  run_benchmark "performance"
  run_benchmark "audit"
  run_benchmark "accuracy"
else
  run_benchmark "$benchmark_type"
fi

MaxText/maxengine.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1230,7 +1230,8 @@ def set_engine_vars_from_base_engine(
12301230
"""Set internal vars from base_engine, which has already loaded the checkpoint and has sharding,
12311231
mesh, and kv cache related vars set.
12321232
"""
1233-
engine.model.quant.quant_mode = base_engine.model.quant.quant_mode
1233+
if base_engine.model.quant:
1234+
engine.model.quant.quant_mode = base_engine.model.quant.quant_mode
12341235
engine.state_mesh_annotations = base_engine.state_mesh_annotations
12351236
engine.abstract_params = base_engine.abstract_params
12361237
engine.kv_cache_annotations = max_utils.get_kv_cache_annotations(engine.model, engine.config, rng, engine.mesh) # pylint: disable=protected-access

0 commit comments

Comments
 (0)