Changes from all commits (58 commits)
defa581  [add]add whisper sdpa (SangChengC, Sep 26, 2025)
87c15dc  [add]add qwen3-vl-moe support (Oct 22, 2025)
c318d72  fix1103 (Nov 3, 2025)
2ebdb58  add qwen3-vl support (Nov 21, 2025)
1588ff3  1203 (Dec 3, 2025)
cd9c7ee  Merge branch 'main' into add-qwen3-vl (Dec 4, 2025)
3ee963e  1204 (Dec 4, 2025)
0da89eb  1210 (Dec 10, 2025)
02486eb  1210 (Dec 10, 2025)
f6c5d64  Merge branch 'main' into add-qwen3-vl (Dec 10, 2025)
1902799  1210 (Dec 10, 2025)
ebd5f7c  1210 (Dec 10, 2025)
ae29f70  1210 (Dec 10, 2025)
544f625  1210 (Dec 10, 2025)
46d2414  fix-qwen2-vl-mrope-pos-id (Dec 12, 2025)
79c1fcf  mrope refactor (chunkedprefill waiting to verify) (shihaobai, Dec 14, 2025)
95fc1d4  fix chunked prefill (shihaobai, Dec 14, 2025)
29fd280  improve mrope (shihaobai, Dec 15, 2025)
4412556  add vlm acc benchmark (shihaobai, Dec 15, 2025)
c08eb0c  remove comment (shihaobai, Dec 15, 2025)
fd380f0  Merge branch 'qwen2-vl-mrope-fix' into add-qwen3-vl (Dec 15, 2025)
e7da666  remove blocking ops (shihaobai, Dec 15, 2025)
f4d10cf  fix start_idx used (Dec 15, 2025)
abfb4ec  fix tap (Dec 15, 2025)
ee4710c  fix-mrope (Dec 15, 2025)
e45189c  Merge remote-tracking branch 'origin/qwen2-vl-mrope-fix' into add-qwe… (Dec 15, 2025)
b699c60  add-qwen3-vl (Dec 16, 2025)
8f97e99  Merge branch 'main' into add-qwen3-vl (Dec 16, 2025)
c63cae9  add-qwen3-vl (Dec 16, 2025)
0e7047d  import deepstack (Dec 16, 2025)
fa45ff9  add-qwen3-vl-1216 (Dec 16, 2025)
f5d1d60  refactor mrope (shihaobai, Dec 16, 2025)
ce02b13  Merge branch 'add-qwen2-vl' of https://github.com/ModelTC/lightllm in… (shihaobai, Dec 16, 2025)
49c949f  add-qwen3-vl1216 (Dec 16, 2025)
8d33f1a  fix (shihaobai, Dec 16, 2025)
884c227  Merge branch 'add-qwen3-vl' of https://github.com/ModelTC/lightllm in… (shihaobai, Dec 16, 2025)
dc2aad9  fix (shihaobai, Dec 16, 2025)
4519d57  add-qwen3-vl-1216 (Dec 16, 2025)
057bb1d  openai samping params (shihaobai, Dec 16, 2025)
5d1baef  Merge branch 'add-qwen3-vl' of https://github.com/ModelTC/lightllm in… (shihaobai, Dec 16, 2025)
b59dc5b  remove qwen2-vl resize (Dec 17, 2025)
88c33c6  fix deepstack (Dec 17, 2025)
d249aaf  fix cuda (Dec 17, 2025)
427c5e8  fix-qwen3-vl-1217 (Dec 17, 2025)
eaab652  update tensor2bytes (shihaobai, Dec 17, 2025)
dadf600  merge (shihaobai, Dec 17, 2025)
16282af  fix (shihaobai, Dec 17, 2025)
841867d  fix (shihaobai, Dec 17, 2025)
1321f2e  Merge branch 'main' into add-qwen3-vl (shihaobai, Dec 18, 2025)
08a3484  refactor mrope (shihaobai, Dec 18, 2025)
a0c8bf0  qwen3 moe (shihaobai, Dec 18, 2025)
6df4156  refactor weight (shihaobai, Dec 18, 2025)
e9e5025  fix (shihaobai, Dec 18, 2025)
bda9b67  add embed cache one (Dec 18, 2025)
042a26b  fix (hiworldwzj, Dec 18, 2025)
60fc7f5  fix (hiworldwzj, Dec 18, 2025)
2aec5a1  fix (hiworldwzj, Dec 18, 2025)
b298257  fix whisper (hiworldwzj, Dec 18, 2025)
2 changes: 2 additions & 0 deletions lightllm/models/__init__.py
@@ -29,6 +29,8 @@
 from lightllm.models.internvl.model import InternVLInternlm2TpPartModel
 from lightllm.models.qwen2_vl.model import Qwen2VLTpPartModel
 from lightllm.models.qwen2_reward.model import Qwen2RewardTpPartModel
+from lightllm.models.qwen3_vl.model import Qwen3VLTpPartModel
+from lightllm.models.qwen3_vl_moe.model import Qwen3VLMOETpPartModel
 from lightllm.models.gemma3.model import Gemma3TpPartModel
 from lightllm.models.tarsier2.model import (
     Tarsier2Qwen2TpPartModel,
2 changes: 1 addition & 1 deletion lightllm/models/gemma3/layer_infer/pre_layer_infer.py
@@ -44,7 +44,7 @@ def context_forward(self, input_ids, infer_state, layer_weight):
                 continue
             # pull the img_embeds by uid from shm
             data = read_shm(get_shm_name_embed(img["uuid"]))
-            img_weight.append(bytes2tensor(data).cuda().reshape(img["token_num"], -1))
+            img_weight.append(bytes2tensor(data).view(dtype).view(img["token_num"], -1).cuda(non_blocking=True))
             img_start_token_ids.append(img["token_id"])
             img_token_lens.append(img["token_num"])
             img_start_locs.append(img_start_loc)
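Note on the hunk above: the embedding bytes are now reinterpreted at the model dtype before a single async device copy, instead of uploading raw bytes and reshaping on the GPU. A minimal sketch of the intended round-trip, assuming tensor2bytes/bytes2tensor exchange a flat byte buffer through shared memory (the standalone functions below are illustrative, not the repo's exact implementations):

import torch

def tensor2bytes(t: torch.Tensor) -> bytes:
    # serialize: flatten to contiguous CPU memory, export raw bytes
    return t.detach().contiguous().cpu().view(torch.uint8).numpy().tobytes()

def bytes2tensor(b: bytes) -> torch.Tensor:
    # deserialize: wrap the payload as a flat uint8 tensor
    return torch.frombuffer(bytearray(b), dtype=torch.uint8)

# Reading an image embedding back, mirroring the new line in the diff,
# with dtype being the model compute dtype (e.g. torch.bfloat16):
data = tensor2bytes(torch.randn(16, 8, dtype=torch.bfloat16))
emb = bytes2tensor(data).view(torch.bfloat16).view(16, -1)
# .cuda(non_blocking=True) only overlaps with compute when the source
# buffer is pinned; from pageable memory it falls back to a sync copy.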
6 changes: 3 additions & 3 deletions lightllm/models/qwen2_vl/infer_struct.py
@@ -33,8 +33,8 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
         self.position_ids = position_ids.unsqueeze(0).expand(3, -1)

         self.position_ids = self.position_ids.contiguous()
-        self.position_cos = model._cos_cached[self.position_ids]  # (3, L, D)
-        self.position_sin = model._sin_cached[self.position_ids]  # (3, L, D)
+        self.position_cos = model._cos_cached[self.position_ids]
+        self.position_sin = model._sin_cached[self.position_ids]
         if get_env_start_args().enable_fa3:
             self.max_seq_len = self.max_kv_seq_len
             self.q_max_seq_len = self.max_q_seq_len
@@ -66,7 +66,7 @@ def get_mrope_position(self, multimodal_params: List[dict]) -> torch.Tensor:
         b_image_thwd = torch.tensor(b_image_thwd, device="cpu").cuda(non_blocking=True)  # image_num x 4
         b_image_nums = torch.tensor(b_image_nums, device="cpu").cuda(non_blocking=True)
         b_image_start_num = torch.tensor(b_image_start_num, device="cpu").cuda(non_blocking=True)
-        b_image_len = torch.tensor(b_image_len, device=self.position_ids.device)
+        b_image_len = torch.tensor(b_image_len, device="cpu").cuda(non_blocking=True)
         position_ids = self.position_ids.unsqueeze(0).expand(3, -1).contiguous()
         get_mrope_position_triton(
             b_image_start_idx=b_image_start_idx,
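Context for the b_image_len change (see also the "remove blocking ops" commit): constructing a tensor straight from a Python list on a CUDA device issues a blocking host-to-device copy, while staging on CPU and calling .cuda(non_blocking=True) enqueues the copy on the current stream like the neighboring tensors. A small sketch of the pattern; upload_async is a hypothetical helper, not part of the repo:

import torch

def upload_async(values, dtype=torch.int64, pin=False):
    # Stage the Python list on CPU, then enqueue a host-to-device copy,
    # mirroring torch.tensor(..., device="cpu").cuda(non_blocking=True)
    # in the diff above. non_blocking=True only truly overlaps with GPU
    # compute when the source is pinned; otherwise it degrades to a
    # plain synchronous copy (still correct, just not overlapped).
    host = torch.tensor(values, dtype=dtype, device="cpu")
    if pin:
        host = host.pin_memory()
    return host.cuda(non_blocking=True)

b_image_len = upload_async([64, 256, 1024])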
27 changes: 12 additions & 15 deletions lightllm/models/qwen2_vl/layer_infer/transformer_layer_infer.py
@@ -5,31 +5,28 @@
 from typing import Tuple
 from functools import partial

-from lightllm.models.qwen2_vl.triton_kernel.mrope import mrope_triton
+from lightllm.models.qwen2_vl.triton_kernel.mrope import mrope_triton_fused
 from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer


 class Qwen2VLTransformerLayerInfer(LlamaTransformerLayerInfer):
     def __init__(self, layer_num, network_config, mode=[]):
         super().__init__(layer_num, network_config, mode)
-        self.mrope_section = network_config["rope_scaling"]["mrope_section"]
-        axis_map = []
-        for i, n in enumerate(self.mrope_section * 2):
-            axis_map += [i % 3] * n
-        self.axis_map = torch.tensor(axis_map, dtype=torch.int32, device="cuda")
+        mrope_section = network_config["rope_scaling"]["mrope_section"]
+        self.mrope_section = torch.tensor(mrope_section, dtype=torch.int32, device="cuda")

     def _get_qkv(self, input, infer_state, layer_weight):
         q = layer_weight.q_proj.mm(input)
         cache_kv = layer_weight.kv_proj.mm(input).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
-        seq_len, _ = q.shape
-        q = q.view(1, seq_len, -1, self.head_dim_).transpose(1, 2)
-        self.axis_map = self.axis_map.to(q.device)
-        k = cache_kv[:, : self.tp_k_head_num_, :].view(1, seq_len, -1, self.head_dim_).transpose(1, 2)
-        new_q, new_k = mrope_triton(q, k, infer_state.position_cos, infer_state.position_sin, self.axis_map)
-        new_q = new_q.transpose(1, 2).reshape(1, seq_len, -1)
-        cache_kv[:, : self.tp_k_head_num_, :] = new_k.squeeze(0).permute(1, 0, 2)
-
-        return new_q, cache_kv
+        mrope_triton_fused(
+            q.view(-1, self.tp_q_head_num_, self.head_dim_),
+            cache_kv[:, : self.tp_k_head_num_, :],
+            infer_state.position_cos,
+            infer_state.position_sin,
+            self.mrope_section,
+            is_interleaved=False,
+        )
+        return q, cache_kv

     def _tpsp_get_qkv(self, input, infer_state, layer_weight) -> Tuple[torch.Tensor, torch.Tensor]:
         # TODO
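For readers comparing the two kernels: the deleted Python loop precomputed an axis map telling each rotary dimension which of the three M-RoPE position streams (temporal/height/width) it follows, while mrope_triton_fused now takes mrope_section directly and rotates q and the k slice of cache_kv in place. A standalone reconstruction of that mapping, assuming Qwen2-VL's default mrope_section of [16, 24, 24]:

import torch

mrope_section = [16, 24, 24]  # t/h/w split over half the rotary dim
axis_map = []
for i, n in enumerate(mrope_section * 2):  # doubled: cos half + sin half
    axis_map += [i % 3] * n  # 0 = temporal, 1 = height, 2 = width
axis_map = torch.tensor(axis_map, dtype=torch.int32)

# axis_map[d] selects which row of the (3, L, D) position_cos/position_sin
# tables rotates dimension d of each head: dims 0..15 follow the temporal
# positions, dims 16..39 the height positions, dims 40..63 the width ones.
assert axis_map.numel() == 2 * sum(mrope_section)  # == head_dim (128 here)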
@@ -138,7 +138,6 @@ def test():
             b_q_seq_len,
             b_start_loc,
         )
-        print(position_ids)
         """
         tensor([[0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 2, 2, 2, 2, 4, 5, 6, 7, 8],
                 [0, 0, 1, 1, 2, 3, 4, 0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8],