[ds fp4] set block-size to 16, serve on port 9000, use --enforce-eager, and run the server in the background

ZhiweiYan-96 · ZhiweiYan-96 · commit df11d7a907a2 · 2025-11-19T07:51:47.000Z
diff --git a/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh b/evaluation/deepseek_fp4/launch_deepseekr1_fp4_TP.sh
@@ -30,16 +30,16 @@ echo "running $model_path"
 
 vllm serve $model_path \
   --host localhost \
-  --port 6789 \
+  --port 9000 \
   --tensor-parallel-size 8 \
   --max-num-batched-tokens 32768 \
   --trust-remote-code \
   --no-enable-prefix-caching \
   --disable-log-requests \
-  --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
+  --enforce-eager \
   --gpu_memory_utilization 0.7 \
-  --block-size 1 \
-  --seed 123 2>&1 | tee log.server.log
+  --block-size 16 \
+  --seed 123 2>&1 | tee log.server.log &
 
-  # --enforce-eager \
+# --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
   # --enable-expert-parallel \