
Commit 2f38c51

Authored Mar 25, 2025
Add Qwen3 and Qwen3MoE (#3305)
1 parent 81c815e commit 2f38c51

12 files changed: +240 additions, -6 deletions
 

lmdeploy/turbomind/deploy/config.py

+1

@@ -47,6 +47,7 @@ class ModelConfig:
     inter_size: List[int] = None
     norm_eps: float = None
     attn_bias: int = 0
+    qk_norm: bool = False
     size_per_head: int = 128
     group_size: int = 64
     weight_type: str = None

lmdeploy/turbomind/deploy/module.py

+8

@@ -169,6 +169,7 @@ def __init__(self, model: BaseOutputModel):
         self.tp = model.tensor_para_size
         self.head_dim = model.model_config.size_per_head
         self.attn_bias = model.model_config.attn_bias
+        self.qk_norm = model.model_config.qk_norm

     def _reorder_and_merge(self, qkvo):
         q, k, v, o = qkvo
@@ -220,6 +221,13 @@ def _export(self, idx: int, qkvo, kind: str, pack_fn, **kwargs):
     def apply(self, i: int, r: BaseReader):
         for e in get_params(r.attn(i, None), bias=self.attn_bias):
             e(self._export, partial(r.attn, i), i)
+        if self.qk_norm:
+            q, k = r.qk_norm(i)
+            if self.model.permute_qk:
+                q = permute_v2(q, self.head_dim)
+                k = permute_v2(k, self.head_dim)
+            self.model.save_split(q, self._attn.format(i, 'q_norm', '')[:-1])
+            self.model.save_split(k, self._attn.format(i, 'k_norm', '')[:-1])


 class MLA(Module):
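
Note: as a minimal sketch of how the destination key for these exported norm weights is built, the snippet below assumes a template of the form 'layers.{0}.attention.{1}.{2}' for self._attn (an assumption, not copied from the repository); the point is only that formatting with an empty last field and dropping the trailing dot yields per-layer keys such as layers.0.attention.q_norm.

# Hypothetical illustration: '_attn' is an assumed template standing in for the
# real attribute. It shows what `self._attn.format(i, 'q_norm', '')[:-1]` produces.
_attn = 'layers.{0}.attention.{1}.{2}'

def norm_key(i: int, which: str) -> str:
    return _attn.format(i, which, '')[:-1]  # strip the trailing '.'

print(norm_key(0, 'q_norm'))  # layers.0.attention.q_norm
print(norm_key(0, 'k_norm'))  # layers.0.attention.k_norm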

lmdeploy/turbomind/deploy/source_model/qwen.py

+42, -2

@@ -149,7 +149,47 @@ def model_info(self):
         info['expert_inter_size'] = cfg['moe_intermediate_size']
         info['experts_per_token'] = cfg['num_experts_per_tok']
         info['inter_size'] = cfg['shared_expert_intermediate_size']
-        info['moe_shared_gate'] = True
+        info['moe_shared_gate'] = info['inter_size'] > 0
         info['norm_topk_prob'] = cfg['norm_topk_prob']
-        info['attn_bias'] = 1
+        info['attn_bias'] = cfg.get('attention_bias', 1)
+        return info
+
+
+class Qwen3Reader(LlamaReader):
+
+    def qk_norm(self, i: int):
+        result = []
+        for x in ['q', 'k']:
+            name = f'{self.attn_layer_prefix}.{i}.self_attn.{x}_norm.weight'
+            result.append(self.params.get(name))
+        return (*result, )
+
+
+@INPUT_MODELS.register_module(name='qwen3')
+class Qwen3Model(LlamaModel):
+    Reader = Qwen3Reader
+
+    def model_info(self):
+        info = super().model_info()
+        info['qk_norm'] = True
+        return info
+
+
+class Qwen3MoeReader(Qwen2MoeReader):
+
+    def qk_norm(self, i: int):
+        result = []
+        for x in ['q', 'k']:
+            name = f'{self.attn_layer_prefix}.{i}.self_attn.{x}_norm.weight'
+            result.append(self.params.get(name))
+        return (*result, )
+
+
+@INPUT_MODELS.register_module(name='qwen3-moe')
+class Qwen3MoeModel(Qwen2MoeModel):
+    Reader = Qwen3MoeReader
+
+    def model_info(self):
+        info = super().model_info()
+        info['qk_norm'] = True
         return info
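
Note: a small sketch of the checkpoint keys that Qwen3Reader.qk_norm resolves, assuming the standard Hugging Face layout where attn_layer_prefix is 'model.layers' (the prefix value is an assumption; the f-string is taken from the diff above).

# Sketch under the assumption attn_layer_prefix == 'model.layers'.
attn_layer_prefix = 'model.layers'

def qk_norm_names(i: int):
    # The two state-dict keys fetched for layer i.
    return tuple(f'{attn_layer_prefix}.{i}.self_attn.{x}_norm.weight' for x in ('q', 'k'))

print(qk_norm_names(0))
# ('model.layers.0.self_attn.q_norm.weight', 'model.layers.0.self_attn.k_norm.weight')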

lmdeploy/turbomind/supported_models.py

+3

@@ -23,6 +23,9 @@
     # Qwen2
     Qwen2ForCausalLM='qwen2',
     Qwen2MoeForCausalLM='qwen2-moe',
+    # Qwen3
+    Qwen3ForCausalLM='qwen3',
+    Qwen3MoeForCausalLM='qwen3-moe',
     # mistral
     MistralForCausalLM='llama',
     # llava
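
Note: roughly, this table maps the architecture name in a model's config.json to the registered turbomind source-model name. The lookup below is an illustrative simplification, not the actual supported_models.py code; SUPPORTED_ARCHS and turbomind_model_name are hypothetical names.

# Illustrative lookup only.
SUPPORTED_ARCHS = dict(
    Qwen2ForCausalLM='qwen2',
    Qwen2MoeForCausalLM='qwen2-moe',
    Qwen3ForCausalLM='qwen3',
    Qwen3MoeForCausalLM='qwen3-moe',
)

def turbomind_model_name(hf_config: dict) -> str:
    arch = hf_config['architectures'][0]
    return SUPPORTED_ARCHS[arch]

print(turbomind_model_name({'architectures': ['Qwen3MoeForCausalLM']}))  # qwen3-moe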

src/turbomind/kernels/norm/rms_norm.cu

+104

@@ -1,10 +1,16 @@
 // Copyright (c) OpenMMLab. All rights reserved.

+#include <stdexcept>
+
 #include "cub/block/block_reduce.cuh"

 #include "src/turbomind/kernels/core/array_ops.h"
 #include "src/turbomind/kernels/core/common.h"
+#include "src/turbomind/kernels/core/math.h"
+#include "src/turbomind/kernels/core/meta.h"
+
 #include "src/turbomind/kernels/norm/rms_norm.h"
+#include "src/turbomind/utils/Tensor.h"

 namespace turbomind {

@@ -115,6 +121,104 @@ template void invokeRMSNorm(nv_bfloat16* dst,
                             cudaStream_t st);
 #endif

+template<class T, class A, int vec_size, int max_dim>
+__global__ void QkRMSNormKernel(T* data,  //
+                                int ld,
+                                const T* weight,
+                                int dim,
+                                int n,
+                                int token_num,
+                                float eps,
+                                float inv_dim)
+{
+    static_assert((max_dim & (max_dim - 1)) == 0);
+
+    constexpr int thr_per_qk = max_dim / vec_size;
+
+    const int bi = (threadIdx.x + blockIdx.x * blockDim.x) / thr_per_qk;
+    const int di = threadIdx.x % thr_per_qk * vec_size;
+    const int ti = bi / n;
+    const int hi = bi % n;
+
+    if (bi >= token_num * n) {
+        return;
+    }
+
+    data += ti * ld + hi * dim;
+
+    Array<T, vec_size> vec{};
+    if (di < dim) {
+        Load(vec, &data[di]);
+    }
+
+    using namespace ops;
+    auto acc = cast<A>(vec);
+    acc = acc * acc;
+
+    float sum{};
+    PRAGMA_UNROLL
+    for (int i = 0; i < vec_size; ++i) {
+        sum += acc[i];
+    }
+
+    PRAGMA_UNROLL
+    for (int mask = thr_per_qk / 2; mask >= 1; mask /= 2) {
+        sum += __shfl_xor_sync((uint32_t)-1, sum, mask);
+    }
+
+    sum = rsqrtf(sum * inv_dim + eps);
+
+    Array<T, vec_size> w;
+    if (di < dim) {
+        Ldg(w, &weight[di]);
+        PRAGMA_UNROLL
+        for (int i = 0; i < vec_size; ++i) {
+            vec[i] = (T)((float)vec[i] * sum) * w[i];
+        }
+        Store(&data[di], vec);
+    }
+}
+
+void invokeQkRMSNorm(void* data,
+                     int ld,
+                     const void* weight,
+                     DataType dtype,
+                     int head_dim,
+                     int n,
+                     int token_num,
+                     float eps,
+                     cudaStream_t stream)
+{
+    auto invoke = [&](auto t, auto max_dim_t) {
+        using T = decltype(t);
+
+        constexpr int vec_size = sizeof(uint4) / sizeof(T);
+        constexpr int max_dim = max_dim_t.value;
+        constexpr int thr_per_qk = max_dim / vec_size;
+
+        FT_CHECK(head_dim % vec_size == 0);
+
+        const int threads = thr_per_qk * n * (int64_t)token_num;
+        const int block_dim = 512;
+        const int grid_dim = cdiv(threads, block_dim);
+
+        QkRMSNormKernel<T, float, vec_size, max_dim><<<grid_dim, block_dim, 0, stream>>>(
+            (T*)data, ld, (const T*)weight, head_dim, n, token_num, eps, 1.f / head_dim);
+    };
+
+    constexpr constant<128> max_dim{};
+    FT_CHECK(head_dim <= max_dim);
+
+    switch (dtype) {
+        case TYPE_FP16:
+            return invoke(half{}, max_dim);
+        case TYPE_BF16:
+            return invoke(nv_bfloat16{}, max_dim);
+        default:
+            throw std::runtime_error("not implemented");
+    }
+}
+
 // r' <- r + (h + b)
 // h' <- norm(r') * w
 template<class T, class Tacc, int block_dim, int vec_size>
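
Note: as a rough functional reference (an illustration, not the kernel), the normalization QkRMSNormKernel applies to each head's head_dim-wide slice is sketched below in NumPy: squares are accumulated in fp32, scaled by 1/dim, offset by eps, and the rsqrt of that rescales the slice before the shared per-dimension weight is applied. The real kernel vectorizes loads, reduces across a group of thr_per_qk threads with warp shuffles, and writes in place.

# Functional reference only (assumed equivalent behaviour, not the CUDA implementation).
import numpy as np

def qk_rms_norm_ref(data, weight, head_dim, n_heads, eps):
    # data: [token_num, ld]; only the first n_heads * head_dim columns are normalized.
    out = data.copy()
    for hi in range(n_heads):
        sl = slice(hi * head_dim, (hi + 1) * head_dim)
        x = data[:, sl].astype(np.float32)
        r = 1.0 / np.sqrt((x * x).mean(axis=-1, keepdims=True) + eps)
        out[:, sl] = (x * r * weight.astype(np.float32)).astype(data.dtype)
    return out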

src/turbomind/kernels/norm/rms_norm.h

+10

@@ -16,6 +16,16 @@ void invokeRMSNorm(T* dst, const T* src, const T* weights, int dims, int num, fl
     invokeRMSNorm(dst, dims, src, dims, weights, dims, num, eps, st);
 }

+void invokeQkRMSNorm(void* data,
+                     int ld,
+                     const void* weight,
+                     DataType dtype,
+                     int head_dim,
+                     int n,
+                     int token_num,
+                     float eps,
+                     cudaStream_t stream);
+
 template<class T>
 void invokeBiasResidualRMSNorm(
     T* residual, T* hidden_states, const T* weights, const T* bias, int dims, int num, float eps, cudaStream_t st);

src/turbomind/models/llama/LlamaDecoderLayerWeight.cc

+14

@@ -74,6 +74,7 @@ LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(int layer_id,
                                                 kv_head_num_,
                                                 model.mla,
                                                 attn_bias_,
+                                                model.qk_norm,
                                                 tensor_para_size_,
                                                 weight_type_,
                                                 model.group_size};
@@ -380,6 +381,19 @@ TensorMap LlamaDecoderLayerWeight<T>::getParams(std::string prefix)

     if (self_attn_weights.qkv.output_dims) {
         getWeightTensor(self_attn_weights.qkv, attn_bias_, get_prefix("attention.w_qkv"), output);
+
+        if (self_attn_weights.qk_norm) {
+            output.insert(concat(prefix, "attention.q_norm"),
+                          Tensor{MEMORY_GPU,
+                                 getTensorType<T>(),
+                                 {sizeof(T) * self_attn_weights.head_dim},
+                                 self_attn_weights.q_a_layernorm});
+            output.insert(concat(prefix, "attention.k_norm"),
+                          Tensor{MEMORY_GPU,
+                                 getTensorType<T>(),
+                                 {sizeof(T) * self_attn_weights.head_dim},
+                                 self_attn_weights.kv_a_layernorm});
+        }
     }
     else {
         getMLATensor(self_attn_weights, prefix, output, tensor_para_rank_);

src/turbomind/models/llama/LlamaDenseWeight.h

+14, -3

@@ -136,11 +136,15 @@ struct LlamaAttentionWeight {
                          size_t kv_head_num,
                          MLAParam mla,
                          bool bias,
+                         bool qk_norm,
                          size_t tp,
                          WeightType weight_type,
                          int group_size)
     {
-        this->bias = bias;
+        this->bias     = bias;
+        this->head_dim = head_dim;
+        this->qk_norm  = qk_norm;
+
         if (mla.kv_lora_rank == 0) {
             qkv = {hidden_dim, (head_num + 2 * kv_head_num) * head_dim / tp, weight_type, group_size};
         }
@@ -163,8 +167,12 @@ struct LlamaAttentionWeight {
     {
         if (qkv.output_dims) {
            qkv.malloc(st, bias);
+            if (qk_norm) {
+                deviceMalloc((T**)&q_a_layernorm, head_dim, st);
+                deviceMalloc((T**)&kv_a_layernorm, head_dim, st);
+            }
         }
-        else {
+        else {  // MLA
            if (q_proj.output_dims) {
                q_proj.malloc(st);
            }
@@ -193,9 +201,12 @@ struct LlamaAttentionWeight {
         deviceFree(kv_a_layernorm, st);
     }

+    int  head_dim{};
+    bool bias{};
+    bool qk_norm{};
+
     LlamaDenseWeight<T> qkv;
     LlamaDenseWeight<T> output;
-    bool bias{};

     LlamaDenseWeight<T> q_proj;
     LlamaDenseWeight<T> q_a_proj;

src/turbomind/models/llama/llama_params.h

+1

@@ -32,6 +32,7 @@ struct ModelParam {
     WeightType weight_type;
     int group_size;
     MLAParam mla;
+    bool qk_norm;
     int tune_layer_num;

     std::vector<int> inter_size;

src/turbomind/models/llama/unified_attention_layer.cc

+38

@@ -211,6 +211,10 @@ inline void UnifiedAttentionLayer<T>::forward(TensorMap* outputs, const TensorMa
         linear_->forward(
             qkv_buf_, attention_input, token_num, weights->qkv, LlamaLinear<T>::kGemm, lora_buf_, lora_mask);
         sync_check_cuda_error();
+
+        if (model_param_.qk_norm) {
+            qk_norm(qkv_buf_, token_num, *weights);
+        }
     }
     else {
         forward_mla(attention_input, token_num, *weights);
@@ -520,6 +524,40 @@ void UnifiedAttentionLayer<T>::forward_mla(const T* inputs, int token_num, const
     deviceFree(kv_b, stream_);
 }

+template<typename T>
+void UnifiedAttentionLayer<T>::qk_norm(T* qkv, int token_num, const WeightType& weights)
+{
+    check_cuda_error(cudaEventRecord(qkv_event_, stream_));
+    check_cuda_error(cudaStreamWaitEvent(aux_stream_, qkv_event_));
+
+    FT_CHECK(model_param_.attn_bias == false);
+
+    invokeQkRMSNorm(qkv_buf_,
+                    weights.qkv.output_dims,
+                    weights.q_a_layernorm,
+                    getTensorType<T>(),
+                    size_per_head_,
+                    local_head_num_,
+                    token_num,
+                    model_param_.norm_eps,
+                    stream_);
+    sync_check_cuda_error();
+
+    invokeQkRMSNorm(qkv_buf_ + size_per_head_ * local_head_num_,
+                    weights.qkv.output_dims,
+                    weights.kv_a_layernorm,
+                    getTensorType<T>(),
+                    size_per_head_,
+                    local_kv_head_num_,
+                    token_num,
+                    model_param_.norm_eps,
+                    aux_stream_);
+    sync_check_cuda_error();
+
+    check_cuda_error(cudaEventRecord(aux_event_, aux_stream_));
+    check_cuda_error(cudaStreamWaitEvent(stream_, aux_event_));
+}
+
 #ifdef ENABLE_FP32
 template class UnifiedAttentionLayer<float>;
 #endif
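
Note: to make the pointer arithmetic above concrete, the fused QKV buffer stores all Q heads, then all K heads, then all V heads per token, so the second invokeQkRMSNorm call starts at the K section and V is left untouched. A small sketch of the offsets (illustrative values, not taken from the source):

# Illustrative per-token layout assumed by the two calls above:
# [ Q: local_head_num * size_per_head | K: local_kv_head_num * size_per_head | V: ... ]
def qkv_offsets(size_per_head, local_head_num, local_kv_head_num):
    q_off = 0
    k_off = size_per_head * local_head_num             # start of the K section
    v_off = k_off + size_per_head * local_kv_head_num  # V section is not normalized
    return q_off, k_off, v_off

print(qkv_offsets(128, 32, 4))  # (0, 4096, 4608), e.g. a 32-head / 4-KV-head shard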

src/turbomind/models/llama/unified_attention_layer.h

+2

@@ -110,6 +110,8 @@ class UnifiedAttentionLayer {
 private:
     void forward_mla(const T* inputs, int token_num, const WeightType& weights);

+    void qk_norm(T* qkv, int token_num, const WeightType& weights);
+
 private:
     const size_t head_num_;
     const size_t kv_head_num_;

src/turbomind/triton_backend/llama/LlamaTritonModel.cc

+3, -1

@@ -235,6 +235,7 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t ten
     }
     // Only weight classes need these
     model_param_.attn_bias = model_reader["attn_bias"].as<int>(0);
+    model_param_.qk_norm = model_reader["qk_norm"].as<bool>();
     model_param_.group_size = model_reader["group_size"].as<int>(0);

     // rotary embedding parameters
@@ -471,7 +472,8 @@ std::string LlamaTritonModel<T>::toString()
        << model_param_.head_dim
        // << "\ninter_size: " << model_param_.inter_size
        << "\nnum_layer: " << model_param_.layer_num << "\nvocab_size: " << model_param_.vocab_size
-       << "\nattn_bias: " << model_param_.attn_bias << "\nmax_batch_size: " << engine_param_.max_batch_size
+       << "\nattn_bias: " << model_param_.attn_bias << "\nqk_norm: " << model_param_.qk_norm
+       << "\nmax_batch_size: " << engine_param_.max_batch_size
        << "\nmax_prefill_token_num: " << engine_param_.max_prefill_token_num
        << "\nmax_context_token_num: " << engine_param_.max_context_token_num
        << "\nnum_tokens_per_iter: " << engine_param_.num_tokens_per_iter
