
Commit 2f38c51

Authored Mar 25, 2025
Add Qwen3 and Qwen3MoE (#3305)
1 parent 81c815e commit 2f38c51

12 files changed: +240 additions, -6 deletions
 

lmdeploy/turbomind/deploy/config.py

+1

@@ -47,6 +47,7 @@ class ModelConfig:
     inter_size: List[int] = None
     norm_eps: float = None
     attn_bias: int = 0
+    qk_norm: bool = False
     size_per_head: int = 128
     group_size: int = 64
     weight_type: str = None

lmdeploy/turbomind/deploy/module.py

+8

@@ -169,6 +169,7 @@ def __init__(self, model: BaseOutputModel):
         self.tp = model.tensor_para_size
         self.head_dim = model.model_config.size_per_head
         self.attn_bias = model.model_config.attn_bias
+        self.qk_norm = model.model_config.qk_norm

     def _reorder_and_merge(self, qkvo):
         q, k, v, o = qkvo
@@ -220,6 +221,13 @@ def _export(self, idx: int, qkvo, kind: str, pack_fn, **kwargs):
     def apply(self, i: int, r: BaseReader):
         for e in get_params(r.attn(i, None), bias=self.attn_bias):
             e(self._export, partial(r.attn, i), i)
+        if self.qk_norm:
+            q, k = r.qk_norm(i)
+            if self.model.permute_qk:
+                q = permute_v2(q, self.head_dim)
+                k = permute_v2(k, self.head_dim)
+            self.model.save_split(q, self._attn.format(i, 'q_norm', '')[:-1])
+            self.model.save_split(k, self._attn.format(i, 'k_norm', '')[:-1])


 class MLA(Module):
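
Note: as a minimal sketch of how the destination key for these exported norm weights is built, the snippet below assumes a template of the form 'layers.{0}.attention.{1}.{2}' for self._attn (an assumption, not copied from the repository); the point is only that formatting with an empty last field and dropping the trailing dot yields per-layer keys such as layers.0.attention.q_norm.

# Hypothetical illustration: '_attn' is an assumed template standing in for the
# real attribute. It shows what `self._attn.format(i, 'q_norm', '')[:-1]` produces.
_attn = 'layers.{0}.attention.{1}.{2}'

def norm_key(i: int, which: str) -> str:
    return _attn.format(i, which, '')[:-1]  # strip the trailing '.'

print(norm_key(0, 'q_norm'))  # layers.0.attention.q_norm
print(norm_key(0, 'k_norm'))  # layers.0.attention.k_norm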

lmdeploy/turbomind/deploy/source_model/qwen.py

+42, -2

@@ -149,7 +149,47 @@ def model_info(self):
         info['expert_inter_size'] = cfg['moe_intermediate_size']
         info['experts_per_token'] = cfg['num_experts_per_tok']
         info['inter_size'] = cfg['shared_expert_intermediate_size']
-        info['moe_shared_gate'] = True
+        info['moe_shared_gate'] = info['inter_size'] > 0
         info['norm_topk_prob'] = cfg['norm_topk_prob']
-        info['attn_bias'] = 1
+        info['attn_bias'] = cfg.get('attention_bias', 1)
+        return info
+
+
+class Qwen3Reader(LlamaReader):
+
+    def qk_norm(self, i: int):
+        result = []
+        for x in ['q', 'k']:
+            name = f'{self.attn_layer_prefix}.{i}.self_attn.{x}_norm.weight'
+            result.append(self.params.get(name))
+        return (*result, )
+
+
+@INPUT_MODELS.register_module(name='qwen3')
+class Qwen3Model(LlamaModel):
+    Reader = Qwen3Reader
+
+    def model_info(self):
+        info = super().model_info()
+        info['qk_norm'] = True
+        return info
+
+
+class Qwen3MoeReader(Qwen2MoeReader):
+
+    def qk_norm(self, i: int):
+        result = []
+        for x in ['q', 'k']:
+            name = f'{self.attn_layer_prefix}.{i}.self_attn.{x}_norm.weight'
+            result.append(self.params.get(name))
+        return (*result, )
+
+
+@INPUT_MODELS.register_module(name='qwen3-moe')
+class Qwen3MoeModel(Qwen2MoeModel):
+    Reader = Qwen3MoeReader
+
+    def model_info(self):
+        info = super().model_info()
+        info['qk_norm'] = True
         return info
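
Note: a small sketch of the checkpoint keys that Qwen3Reader.qk_norm resolves, assuming the standard Hugging Face layout where attn_layer_prefix is 'model.layers' (the prefix value is an assumption; the f-string is taken from the diff above).

# Sketch under the assumption attn_layer_prefix == 'model.layers'.
attn_layer_prefix = 'model.layers'

def qk_norm_names(i: int):
    # The two state-dict keys fetched for layer i.
    return tuple(f'{attn_layer_prefix}.{i}.self_attn.{x}_norm.weight' for x in ('q', 'k'))

print(qk_norm_names(0))
# ('model.layers.0.self_attn.q_norm.weight', 'model.layers.0.self_attn.k_norm.weight')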

lmdeploy/turbomind/supported_models.py

+3

@@ -23,6 +23,9 @@
     # Qwen2
     Qwen2ForCausalLM='qwen2',
     Qwen2MoeForCausalLM='qwen2-moe',
+    # Qwen3
+    Qwen3ForCausalLM='qwen3',
+    Qwen3MoeForCausalLM='qwen3-moe',
     # mistral
     MistralForCausalLM='llama',
     # llava
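
Note: roughly, this table maps the architecture name in a model's config.json to the registered turbomind source-model name. The lookup below is an illustrative simplification, not the actual supported_models.py code; SUPPORTED_ARCHS and turbomind_model_name are hypothetical names.

# Illustrative lookup only.
SUPPORTED_ARCHS = dict(
    Qwen2ForCausalLM='qwen2',
    Qwen2MoeForCausalLM='qwen2-moe',
    Qwen3ForCausalLM='qwen3',
    Qwen3MoeForCausalLM='qwen3-moe',
)

def turbomind_model_name(hf_config: dict) -> str:
    arch = hf_config['architectures'][0]
    return SUPPORTED_ARCHS[arch]

print(turbomind_model_name({'architectures': ['Qwen3MoeForCausalLM']}))  # qwen3-moe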

src/turbomind/kernels/norm/rms_norm.cu

+104

@@ -1,10 +1,16 @@
 // Copyright (c) OpenMMLab. All rights reserved.

+#include <stdexcept>
+
 #include "cub/block/block_reduce.cuh"

 #include "src/turbomind/kernels/core/array_ops.h"
 #include "src/turbomind/kernels/core/common.h"
+#include "src/turbomind/kernels/core/math.h"
+#include "src/turbomind/kernels/core/meta.h"
+
 #include "src/turbomind/kernels/norm/rms_norm.h"
+#include "src/turbomind/utils/Tensor.h"

 namespace turbomind {

@@ -115,6 +121,104 @@ template void invokeRMSNorm(nv_bfloat16* dst,
                             cudaStream_t st);
 #endif

+template<class T, class A, int vec_size, int max_dim>
+__global__ void QkRMSNormKernel(T* data,  //
+                                int ld,
+                                const T* weight,
+                                int dim,
+                                int n,
+                                int token_num,
+                                float eps,
+                                float inv_dim)
+{
+    static_assert((max_dim & (max_dim - 1)) == 0);
+
+    constexpr int thr_per_qk = max_dim / vec_size;
+
+    const int bi = (threadIdx.x + blockIdx.x * blockDim.x) / thr_per_qk;
+    const int di = threadIdx.x % thr_per_qk * vec_size;
+    const int ti = bi / n;
+    const int hi = bi % n;
+
+    if (bi >= token_num * n) {
+        return;
+    }
+
+    data += ti * ld + hi * dim;
+
+    Array<T, vec_size> vec{};
+    if (di < dim) {
+        Load(vec, &data[di]);
+    }
+
+    using namespace ops;
+    auto acc = cast<A>(vec);
+    acc = acc * acc;
+
+    float sum{};
+    PRAGMA_UNROLL
+    for (int i = 0; i < vec_size; ++i) {
+        sum += acc[i];
+    }
+
+    PRAGMA_UNROLL
+    for (int mask = thr_per_qk / 2; mask >= 1; mask /= 2) {
+        sum += __shfl_xor_sync((uint32_t)-1, sum, mask);
+    }
+
+    sum = rsqrtf(sum * inv_dim + eps);
+
+    Array<T, vec_size> w;
+    if (di < dim) {
+        Ldg(w, &weight[di]);
+        PRAGMA_UNROLL
+        for (int i = 0; i < vec_size; ++i) {
+            vec[i] = (T)((float)vec[i] * sum) * w[i];
+        }
+        Store(&data[di], vec);
+    }
+}
+
+void invokeQkRMSNorm(void* data,
+                     int ld,
+                     const void* weight,
+                     DataType dtype,
+                     int head_dim,
+                     int n,
+                     int token_num,
+                     float eps,
+                     cudaStream_t stream)
+{
+    auto invoke = [&](auto t, auto max_dim_t) {
+        using T = decltype(t);
+
+        constexpr int vec_size = sizeof(uint4) / sizeof(T);
+        constexpr int max_dim = max_dim_t.value;
+        constexpr int thr_per_qk = max_dim / vec_size;
+
+        FT_CHECK(head_dim % vec_size == 0);
+
+        const int threads = thr_per_qk * n * (int64_t)token_num;
+        const int block_dim = 512;
+        const int grid_dim = cdiv(threads, block_dim);
+
+        QkRMSNormKernel<T, float, vec_size, max_dim><<<grid_dim, block_dim, 0, stream>>>(
+            (T*)data, ld, (const T*)weight, head_dim, n, token_num, eps, 1.f / head_dim);
+    };
+
+    constexpr constant<128> max_dim{};
+    FT_CHECK(head_dim <= max_dim);
+
+    switch (dtype) {
+        case TYPE_FP16:
+            return invoke(half{}, max_dim);
+        case TYPE_BF16:
+            return invoke(nv_bfloat16{}, max_dim);
+        default:
+            throw std::runtime_error("not implemented");
+    }
+}
+
 // r' <- r + (h + b)
 // h' <- norm(r') * w
 template<class T, class Tacc, int block_dim, int vec_size>
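
Note: as a rough functional reference (an illustration, not the kernel), the normalization QkRMSNormKernel applies to each head's head_dim-wide slice is sketched below in NumPy: squares are accumulated in fp32, scaled by 1/dim, offset by eps, and the rsqrt of that rescales the slice before the shared per-dimension weight is applied. The real kernel vectorizes loads, reduces across a group of thr_per_qk threads with warp shuffles, and writes in place.

# Functional reference only (assumed equivalent behaviour, not the CUDA implementation).
import numpy as np

def qk_rms_norm_ref(data, weight, head_dim, n_heads, eps):
    # data: [token_num, ld]; only the first n_heads * head_dim columns are normalized.
    out = data.copy()
    for hi in range(n_heads):
        sl = slice(hi * head_dim, (hi + 1) * head_dim)
        x = data[:, sl].astype(np.float32)
        r = 1.0 / np.sqrt((x * x).mean(axis=-1, keepdims=True) + eps)
        out[:, sl] = (x * r * weight.astype(np.float32)).astype(data.dtype)
    return out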

src/turbomind/kernels/norm/rms_norm.h

+10

@@ -16,6 +16,16 @@ void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, fl
     invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st);
 }

+void invokeQkRMSNorm(void* data,
+                     int ld,
+                     const void* weight,
+                     DataType dtype,
+                     int head_dim,
+                     int n,
+                     int token_num,
+                     float eps,
+                     cudaStream_t stream);
+
 template<class T>
 void invokeBiasResidualRMSNorm(
     T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st);

src/turbomind/models/llama/LlamaDecoderLayerWeight.cc

+14

@@ -74,6 +74,7 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(int layer_id,
                                                 kv_head_num_,
                                                 model.mla,
                                                 attn_bias_,
+                                                model.qk_norm,
                                                 tensor_para_size_,
                                                 weight_type_,
                                                 model.group_size};
@@ -380,6 +381,19 @@ TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)

     if (self_attn_weights.qkv.output_dims) {
         getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output);
+
+        if (self_attn_weights.qk_norm) {
+            output.insert(concat(prefix, "attention.q_norm"),
+                          Tensor{MEMORY_GPU,
+                                 getTensorType<T>(),
+                                 {sizeof(T) * self_attn_weights.head_dim},
+                                 self_attn_weights.q_a_layernorm});
+            output.insert(concat(prefix, "attention.k_norm"),
+                          Tensor{MEMORY_GPU,
+                                 getTensorType<T>(),
+                                 {sizeof(T) * self_attn_weights.head_dim},
+                                 self_attn_weights.kv_a_layernorm});
+        }
     }
     else {
         getMLATensor(self_attn_weights, prefix, output, tensor_para_rank_);

src/turbomind/models/llama/LlamaDenseWeight.h

+14, -3

@@ -136,11 +136,15 @@ struct LlamaAttentionWeight {
                          size_t kv_head_num,
                          MLAParam mla,
                          bool bias,
+                         bool qk_norm,
                          size_t tp,
                          WeightType weight_type,
                          int group_size)
     {
-        this->bias = bias;
+        this->bias     = bias;
+        this->head_dim = head_dim;
+        this->qk_norm  = qk_norm;
+
         if (mla.kv_lora_rank == 0) {
             qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size};
         }
@@ -163,8 +167,12 @@ struct LlamaAttentionWeight {
     {
         if (qkv.output_dims) {
            qkv.malloc(st, bias);
+            if (qk_norm) {
+                deviceMalloc((T**)&q_a_layernorm, head_dim, st);
+                deviceMalloc((T**)&kv_a_layernorm, head_dim, st);
+            }
         }
-        else {
+        else {  // MLA
            if (q_proj.output_dims) {
                q_proj.malloc(st);
            }
@@ -193,9 +201,12 @@ struct LlamaAttentionWeight {
         deviceFree(kv_a_layernorm, st);
     }

+    int  head_dim{};
+    bool bias{};
+    bool qk_norm{};
+
     LlamaDenseWeight<T> qkv;
     LlamaDenseWeight<T> output;
-    bool bias{};

     LlamaDenseWeight<T> q_proj;
     LlamaDenseWeight<T> q_a_proj;

src/turbomind/models/llama/llama_params.h

+1

@@ -32,6 +32,7 @@ struct ModelParam {
     WeightType weight_type;
     int group_size;
     MLAParam mla;
+    bool qk_norm;
     int tune_layer_num;

     std::vector<int> inter_size;

src/turbomind/models/llama/unified_attention_layer.cc

+38

@@ -211,6 +211,10 @@ inline void UnifiedAttentionLayer<T>::forward(TensorMap* outputs, const TensorMa
         linear_->forward(
             qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear<T>::kGemm, lora_buf_, lora_mask);
         sync_check_cuda_error();
+
+        if (model_param_.qk_norm) {
+            qk_norm(qkv_buf_, token_num, *weights);
+        }
     }
     else {
         forward_mla(attention_input, token_num, *weights);
@@ -520,6 +524,40 @@ void UnifiedAttentionLayer<T>::forward_mla(const T* inputs, int token_num, const
     deviceFree(kv_b, stream_);
 }

+template<typename T>
+void UnifiedAttentionLayer<T>::qk_norm(T* qkv, int token_num, const WeightType& weights)
+{
+    check_cuda_error(cudaEventRecord(qkv_event_, stream_));
+    check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_));
+
+    FT_CHECK(model_param_.attn_bias == false);
+
+    invokeQkRMSNorm(qkv_buf_,
+                    weights.qkv.output_dims,
+                    weights.q_a_layernorm,
+                    getTensorType<T>(),
+                    size_per_head_,
+                    local_head_num_,
+                    token_num,
+                    model_param_.norm_eps,
+                    stream_);
+    sync_check_cuda_error();
+
+    invokeQkRMSNorm(qkv_buf_ + size_per_head_ * local_head_num_,
+                    weights.qkv.output_dims,
+                    weights.kv_a_layernorm,
+                    getTensorType<T>(),
+                    size_per_head_,
+                    local_kv_head_num_,
+                    token_num,
+                    model_param_.norm_eps,
+                    aux_stream_);
+    sync_check_cuda_error();
+
+    check_cuda_error(cudaEventRecord(aux_event_, aux_stream_));
+    check_cuda_error(cudaStreamWaitEvent(stream_, aux_event_));
+}
+
 #ifdef ENABLE_FP32
 template class UnifiedAttentionLayer<float>;
 #endif
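
Note: to make the pointer arithmetic above concrete, the fused QKV buffer stores all Q heads, then all K heads, then all V heads per token, so the second invokeQkRMSNorm call starts at the K section and V is left untouched. A small sketch of the offsets (illustrative values, not taken from the source):

# Illustrative per-token layout assumed by the two calls above:
# [ Q: local_head_num * size_per_head | K: local_kv_head_num * size_per_head | V: ... ]
def qkv_offsets(size_per_head, local_head_num, local_kv_head_num):
    q_off = 0
    k_off = size_per_head * local_head_num             # start of the K section
    v_off = k_off + size_per_head * local_kv_head_num  # V section is not normalized
    return q_off, k_off, v_off

print(qkv_offsets(128, 32, 4))  # (0, 4096, 4608), e.g. a 32-head / 4-KV-head shard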

src/turbomind/models/llama/unified_attention_layer.h

+2

@@ -110,6 +110,8 @@ class UnifiedAttentionLayer {
 private:
     void forward_mla(const T* inputs, int token_num, const WeightType& weights);

+    void qk_norm(T* qkv, int token_num, const WeightType& weights);
+
 private:
     const size_t head_num_;
     const size_t kv_head_num_;

src/turbomind/triton_backend/llama/LlamaTritonModel.cc

+3, -1

@@ -235,6 +235,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t ten
     }
     // Only weight classes need these
     model_param_.attn_bias = model_reader["attn_bias"].as<int>(0);
+    model_param_.qk_norm = model_reader["qk_norm"].as<bool>();
     model_param_.group_size = model_reader["group_size"].as<int>(0);

     // rotary embedding parameters
@@ -471,7 +472,8 @@ std::string LlamaTritonModel<T>::toString()
        << model_param_.head_dim
        // << "\ninter_size: " << model_param_.inter_size
        << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size
-       << "\nattn_bias: " << model_param_.attn_bias << "\nmax_batch_size: " << engine_param_.max_batch_size
+       << "\nattn_bias: " << model_param_.attn_bias << "\nqk_norm: " << model_param_.qk_norm
+       << "\nmax_batch_size: " << engine_param_.max_batch_size
        << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num
        << "\nmax_context_token_num: " << engine_param_.max_context_token_num
        << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter
