Changes from all commits (58 commits)
defa581  [add]add whisper sdpa (SangChengC, Sep 26, 2025)
87c15dc  [add]add qwen3-vl-moe support (Oct 22, 2025)
c318d72  fix1103 (Nov 3, 2025)
2ebdb58  add qwen3-vl support (Nov 21, 2025)
1588ff3  1203 (Dec 3, 2025)
cd9c7ee  Merge branch 'main' into add-qwen3-vl (Dec 4, 2025)
3ee963e  1204 (Dec 4, 2025)
0da89eb  1210 (Dec 10, 2025)
02486eb  1210 (Dec 10, 2025)
f6c5d64  Merge branch 'main' into add-qwen3-vl (Dec 10, 2025)
1902799  1210 (Dec 10, 2025)
ebd5f7c  1210 (Dec 10, 2025)
ae29f70  1210 (Dec 10, 2025)
544f625  1210 (Dec 10, 2025)
46d2414  fix-qwen2-vl-mrope-pos-id (Dec 12, 2025)
79c1fcf  mrope refactor (chunkedprefill waiting to verify) (shihaobai, Dec 14, 2025)
95fc1d4  fix chunked prefill (shihaobai, Dec 14, 2025)
29fd280  improve mrope (shihaobai, Dec 15, 2025)
4412556  add vlm acc benchmark (shihaobai, Dec 15, 2025)
c08eb0c  remove comment (shihaobai, Dec 15, 2025)
fd380f0  Merge branch 'qwen2-vl-mrope-fix' into add-qwen3-vl (Dec 15, 2025)
e7da666  remove blocking ops (shihaobai, Dec 15, 2025)
f4d10cf  fix start_idx used (Dec 15, 2025)
abfb4ec  fix tap (Dec 15, 2025)
ee4710c  fix-mrope (Dec 15, 2025)
e45189c  Merge remote-tracking branch 'origin/qwen2-vl-mrope-fix' into add-qwe… (Dec 15, 2025)
b699c60  add-qwen3-vl (Dec 16, 2025)
8f97e99  Merge branch 'main' into add-qwen3-vl (Dec 16, 2025)
c63cae9  add-qwen3-vl (Dec 16, 2025)
0e7047d  import deepstack (Dec 16, 2025)
fa45ff9  add-qwen3-vl-1216 (Dec 16, 2025)
f5d1d60  refactor mrope (shihaobai, Dec 16, 2025)
ce02b13  Merge branch 'add-qwen2-vl' of https://github.com/ModelTC/lightllm in… (shihaobai, Dec 16, 2025)
49c949f  add-qwen3-vl1216 (Dec 16, 2025)
8d33f1a  fix (shihaobai, Dec 16, 2025)
884c227  Merge branch 'add-qwen3-vl' of https://github.com/ModelTC/lightllm in… (shihaobai, Dec 16, 2025)
dc2aad9  fix (shihaobai, Dec 16, 2025)
4519d57  add-qwen3-vl-1216 (Dec 16, 2025)
057bb1d  openai samping params (shihaobai, Dec 16, 2025)
5d1baef  Merge branch 'add-qwen3-vl' of https://github.com/ModelTC/lightllm in… (shihaobai, Dec 16, 2025)
b59dc5b  remove qwen2-vl resize (Dec 17, 2025)
88c33c6  fix deepstack (Dec 17, 2025)
d249aaf  fix cuda (Dec 17, 2025)
427c5e8  fix-qwen3-vl-1217 (Dec 17, 2025)
eaab652  update tensor2bytes (shihaobai, Dec 17, 2025)
dadf600  merge (shihaobai, Dec 17, 2025)
16282af  fix (shihaobai, Dec 17, 2025)
841867d  fix (shihaobai, Dec 17, 2025)
1321f2e  Merge branch 'main' into add-qwen3-vl (shihaobai, Dec 18, 2025)
08a3484  refactor mrope (shihaobai, Dec 18, 2025)
a0c8bf0  qwen3 moe (shihaobai, Dec 18, 2025)
6df4156  refactor weight (shihaobai, Dec 18, 2025)
e9e5025  fix (shihaobai, Dec 18, 2025)
bda9b67  add embed cache one (Dec 18, 2025)
042a26b  fix (hiworldwzj, Dec 18, 2025)
60fc7f5  fix (hiworldwzj, Dec 18, 2025)
2aec5a1  fix (hiworldwzj, Dec 18, 2025)
b298257  fix whisper (hiworldwzj, Dec 18, 2025)
2 changes: 2 additions & 0 deletions lightllm/models/__init__.py
@@ -29,6 +29,8 @@
 from lightllm.models.internvl.model import InternVLInternlm2TpPartModel
 from lightllm.models.qwen2_vl.model import Qwen2VLTpPartModel
 from lightllm.models.qwen2_reward.model import Qwen2RewardTpPartModel
+from lightllm.models.qwen3_vl.model import Qwen3VLTpPartModel
+from lightllm.models.qwen3_vl_moe.model import Qwen3VLMOETpPartModel
 from lightllm.models.gemma3.model import Gemma3TpPartModel
 from lightllm.models.tarsier2.model import (
     Tarsier2Qwen2TpPartModel,
2 changes: 1 addition & 1 deletion lightllm/models/gemma3/layer_infer/pre_layer_infer.py
@@ -44,7 +44,7 @@ def context_forward(self, input_ids, infer_state, layer_weight):
                 continue
             # pull the img_embeds by uid from shm
             data = read_shm(get_shm_name_embed(img["uuid"]))
-            img_weight.append(bytes2tensor(data).cuda().reshape(img["token_num"], -1))
+            img_weight.append(bytes2tensor(data).view(dtype).view(img["token_num"], -1).cuda(non_blocking=True))
             img_start_token_ids.append(img["token_id"])
             img_token_lens.append(img["token_num"])
             img_start_locs.append(img_start_loc)
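Note on the hunk above: the embedding bytes are now reinterpreted at the model dtype before a single async device copy, instead of uploading raw bytes and reshaping on the GPU. A minimal sketch of the intended round-trip, assuming tensor2bytes/bytes2tensor exchange a flat byte buffer through shared memory (the standalone functions below are illustrative, not the repo's exact implementations):

import torch

def tensor2bytes(t: torch.Tensor) -> bytes:
    # serialize: flatten to contiguous CPU memory, export raw bytes
    return t.detach().contiguous().cpu().view(torch.uint8).numpy().tobytes()

def bytes2tensor(b: bytes) -> torch.Tensor:
    # deserialize: wrap the payload as a flat uint8 tensor
    return torch.frombuffer(bytearray(b), dtype=torch.uint8)

# Reading an image embedding back, mirroring the new line in the diff,
# with dtype being the model compute dtype (e.g. torch.bfloat16):
data = tensor2bytes(torch.randn(16, 8, dtype=torch.bfloat16))
emb = bytes2tensor(data).view(torch.bfloat16).view(16, -1)
# .cuda(non_blocking=True) only overlaps with compute when the source
# buffer is pinned; from pageable memory it falls back to a sync copy.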
6 changes: 3 additions & 3 deletions lightllm/models/qwen2_vl/infer_struct.py
@@ -33,8 +33,8 @@ def init_some_extra_state(self, model, input_ids: torch.Tensor):
         self.position_ids = position_ids.unsqueeze(0).expand(3, -1)

         self.position_ids = self.position_ids.contiguous()
-        self.position_cos = model._cos_cached[self.position_ids]  # (3, L, D)
-        self.position_sin = model._sin_cached[self.position_ids]  # (3, L, D)
+        self.position_cos = model._cos_cached[self.position_ids]
+        self.position_sin = model._sin_cached[self.position_ids]
         if get_env_start_args().enable_fa3:
             self.max_seq_len = self.max_kv_seq_len
             self.q_max_seq_len = self.max_q_seq_len
@@ -66,7 +66,7 @@ def get_mrope_position(self, multimodal_params: List[dict]) -> torch.Tensor:
         b_image_thwd = torch.tensor(b_image_thwd, device="cpu").cuda(non_blocking=True)  # image_num x 4
         b_image_nums = torch.tensor(b_image_nums, device="cpu").cuda(non_blocking=True)
         b_image_start_num = torch.tensor(b_image_start_num, device="cpu").cuda(non_blocking=True)
-        b_image_len = torch.tensor(b_image_len, device=self.position_ids.device)
+        b_image_len = torch.tensor(b_image_len, device="cpu").cuda(non_blocking=True)
         position_ids = self.position_ids.unsqueeze(0).expand(3, -1).contiguous()
         get_mrope_position_triton(
             b_image_start_idx=b_image_start_idx,
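Context for the b_image_len change (see also the "remove blocking ops" commit): constructing a tensor straight from a Python list on a CUDA device issues a blocking host-to-device copy, while staging on CPU and calling .cuda(non_blocking=True) enqueues the copy on the current stream like the neighboring tensors. A small sketch of the pattern; upload_async is a hypothetical helper, not part of the repo:

import torch

def upload_async(values, dtype=torch.int64, pin=False):
    # Stage the Python list on CPU, then enqueue a host-to-device copy,
    # mirroring torch.tensor(..., device="cpu").cuda(non_blocking=True)
    # in the diff above. non_blocking=True only truly overlaps with GPU
    # compute when the source is pinned; otherwise it degrades to a
    # plain synchronous copy (still correct, just not overlapped).
    host = torch.tensor(values, dtype=dtype, device="cpu")
    if pin:
        host = host.pin_memory()
    return host.cuda(non_blocking=True)

b_image_len = upload_async([64, 256, 1024])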
27 changes: 12 additions & 15 deletions lightllm/models/qwen2_vl/layer_infer/transformer_layer_infer.py
@@ -5,31 +5,28 @@
 from typing import Tuple
 from functools import partial

-from lightllm.models.qwen2_vl.triton_kernel.mrope import mrope_triton
+from lightllm.models.qwen2_vl.triton_kernel.mrope import mrope_triton_fused
 from lightllm.models.llama.layer_infer.transformer_layer_infer import LlamaTransformerLayerInfer


 class Qwen2VLTransformerLayerInfer(LlamaTransformerLayerInfer):
     def __init__(self, layer_num, network_config, mode=[]):
         super().__init__(layer_num, network_config, mode)
-        self.mrope_section = network_config["rope_scaling"]["mrope_section"]
-        axis_map = []
-        for i, n in enumerate(self.mrope_section * 2):
-            axis_map += [i % 3] * n
-        self.axis_map = torch.tensor(axis_map, dtype=torch.int32, device="cuda")
+        mrope_section = network_config["rope_scaling"]["mrope_section"]
+        self.mrope_section = torch.tensor(mrope_section, dtype=torch.int32, device="cuda")

     def _get_qkv(self, input, infer_state, layer_weight):
         q = layer_weight.q_proj.mm(input)
         cache_kv = layer_weight.kv_proj.mm(input).view(-1, (self.tp_k_head_num_ + self.tp_v_head_num_), self.head_dim_)
-        seq_len, _ = q.shape
-        q = q.view(1, seq_len, -1, self.head_dim_).transpose(1, 2)
-        self.axis_map = self.axis_map.to(q.device)
-        k = cache_kv[:, : self.tp_k_head_num_, :].view(1, seq_len, -1, self.head_dim_).transpose(1, 2)
-        new_q, new_k = mrope_triton(q, k, infer_state.position_cos, infer_state.position_sin, self.axis_map)
-        new_q = new_q.transpose(1, 2).reshape(1, seq_len, -1)
-        cache_kv[:, : self.tp_k_head_num_, :] = new_k.squeeze(0).permute(1, 0, 2)
-
-        return new_q, cache_kv
+        mrope_triton_fused(
+            q.view(-1, self.tp_q_head_num_, self.head_dim_),
+            cache_kv[:, : self.tp_k_head_num_, :],
+            infer_state.position_cos,
+            infer_state.position_sin,
+            self.mrope_section,
+            is_interleaved=False,
+        )
+        return q, cache_kv

     def _tpsp_get_qkv(self, input, infer_state, layer_weight) -> Tuple[torch.Tensor, torch.Tensor]:
         # TODO
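For readers comparing the two kernels: the deleted Python loop precomputed an axis map telling each rotary dimension which of the three M-RoPE position streams (temporal/height/width) it follows, while mrope_triton_fused now takes mrope_section directly and rotates q and the k slice of cache_kv in place. A standalone reconstruction of that mapping, assuming Qwen2-VL's default mrope_section of [16, 24, 24]:

import torch

mrope_section = [16, 24, 24]  # t/h/w split over half the rotary dim
axis_map = []
for i, n in enumerate(mrope_section * 2):  # doubled: cos half + sin half
    axis_map += [i % 3] * n  # 0 = temporal, 1 = height, 2 = width
axis_map = torch.tensor(axis_map, dtype=torch.int32)

# axis_map[d] selects which row of the (3, L, D) position_cos/position_sin
# tables rotates dimension d of each head: dims 0..15 follow the temporal
# positions, dims 16..39 the height positions, dims 40..63 the width ones.
assert axis_map.numel() == 2 * sum(mrope_section)  # == head_dim (128 here)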
@@ -138,7 +138,6 @@ def test():
             b_q_seq_len,
             b_start_loc,
         )
-        print(position_ids)
         """
         tensor([[0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 2, 2, 2, 2, 4, 5, 6, 7, 8],
                 [0, 0, 1, 1, 2, 3, 4, 0, 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 8],