Commit 8517252

jikunshang and yma11 authored
[Hardware][intel GPU] bump up ipex version to 2.3 (vllm-project#8365)
Co-authored-by: Yan Ma <[email protected]>
1 parent 9ba0817 · commit 8517252

File tree (6 files changed: +60 additions, -87 deletions)

  Dockerfile.xpu
  requirements-xpu.txt
  vllm/_ipex_ops.py
  vllm/attention/backends/ipex_attn.py
  vllm/model_executor/layers/activation.py
  vllm/model_executor/layers/layernorm.py

Dockerfile.xpu (10 additions, 2 deletions)

@@ -1,15 +1,23 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04

 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
     chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
     wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
     echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
     chmod 644 /usr/share/keyrings/intel-graphics.gpg

 RUN apt-get update -y \
     && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
+
+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
+
 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

requirements-xpu.txt (5 additions, 4 deletions)

@@ -3,9 +3,10 @@

 setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed.

-torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
-intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
+torch == 2.3.1+cxx11.abi
+intel-extension-for-pytorch == 2.3.110+xpu
+oneccl_bind_pt == 2.3.100+xpu

-triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+triton-xpu == 3.0.0b2

+--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
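
With the pins above, the XPU stack installs from Intel's extension index instead of hard-coded wheel URLs. A minimal sanity-check sketch, assuming the pinned wheels are installed on a machine with an Intel GPU (the version strings in the comments just mirror the pins):

    # Hedged sketch: confirm the IPEX 2.3 XPU stack imports and sees a device.
    import torch
    import intel_extension_for_pytorch as ipex  # registers the XPU backend with torch

    print(torch.__version__)          # expected to start with 2.3.1
    print(ipex.__version__)           # expected to start with 2.3.110
    print(torch.xpu.is_available())   # True once an Intel GPU is visible
    print(torch.xpu.device_count())   # number of visible XPU devices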

vllm/_ipex_ops.py (29 additions, 69 deletions)

@@ -27,29 +27,27 @@ def _reshape_activation_tensor(

     @staticmethod
     def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.silu_mul(x1, x2, out)
+        ipex.llm.functional.silu_and_mul(x, out)

     @staticmethod
     def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.gelu_mul(x1, x2, out, "none")
+        ipex.llm.functional.gelu_and_mul(x, out)

     @staticmethod
     def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-        x1, x2 = ipex_ops._reshape_activation_tensor(x)
-        ipex.llm.functional.gelu_mul(x1, x2, out, "tanh")
+        ipex.llm.functional.gelu_and_mul(x, out)

     @staticmethod
-    def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
-        out.copy_(torch.nn.functional.gelu(x))
+    def gelu_fast(x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.gelu(x)

     @staticmethod
-    def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
-        out.copy_(torch.nn.functional.gelu(x))
+    def gelu_new(x: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.gelu(x)

-    # TODO add implementation of gelu_quick here
-    # def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+    @staticmethod
+    def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
+        ipex.llm.functional.gelu_quick(x, out)

     @staticmethod
     def paged_attention_v1(
@@ -160,67 +158,26 @@ def rotary_embedding(
         cos_sin_cache: torch.Tensor,  # [cos_sin_dim, rot_dim]
         is_neox: bool,
     ) -> None:
-        if positions.dim() == 1:
-            positions = positions.unsqueeze(0)
-            query = query.unsqueeze(0)
-            key = key.unsqueeze(0)
-
-        rotary_dim = cos_sin_cache.size(1)
-        query = query.view(*query.shape[:-1], -1, head_size)
-        key = key.view(*key.shape[:-1], -1, head_size)
-
-        query_rot = query[..., :rotary_dim]
-        key_rot = key[..., :rotary_dim]
-
-        cos_sin = cos_sin_cache[positions.long()]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-
-        if is_neox:
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
-        else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
-        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
-                                             rotary_dim, is_neox, positions)
+        rot_dim = cos_sin_cache.size(1)
+        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
+                                                     head_size, cos_sin_cache,
+                                                     is_neox, rot_dim)

     @staticmethod
     def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
                                  key: torch.Tensor, head_size: int,
                                  cos_sin_cache: torch.Tensor, is_neox: bool,
                                  rot_dim: int,
                                  cos_sin_cache_offsets: torch.Tensor) -> None:
-        if positions.dim() == 1:
-            positions = positions.unsqueeze(0)
-            query = query.unsqueeze(0)
-            key = key.unsqueeze(0)
-        cos_sin_cache_offsets = cos_sin_cache_offsets.view_as(positions)
-        rotary_dim = cos_sin_cache.size(1)
-        query = query.view(*query.shape[:-1], -1, head_size)
-        key = key.view(*key.shape[:-1], -1, head_size)
-
-        query_rot = query[..., :rotary_dim]
-        key_rot = key[..., :rotary_dim]
-
-        cos_sin = cos_sin_cache[torch.add(positions,
-                                          cos_sin_cache_offsets).long()]
-        cos, sin = cos_sin.chunk(2, dim=-1)
-
-        if is_neox:
-            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
-            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
-        else:
-            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
-            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
-
-        ipex.llm.functional.rotary_embedding(query_rot, key_rot, sin, cos,
-                                             rotary_dim, is_neox, positions)
+        ipex.llm.functional.rotary_embedding_batched(positions, query, key,
+                                                     head_size, cos_sin_cache,
+                                                     is_neox, rot_dim,
+                                                     cos_sin_cache_offsets)

     @staticmethod
-    def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
-                 epsilon: float) -> None:
-        tmp = ipex.llm.functional.rms_norm(input, weight, epsilon)
-        out.copy_(tmp)
+    def rms_norm(input: torch.Tensor, weight: torch.Tensor,
+                 epsilon: float) -> torch.Tensor:
+        return ipex.llm.functional.rms_norm(input, weight, epsilon)

     @staticmethod
     def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
@@ -246,11 +203,14 @@ def varlen_attention(
         return_softmax: bool,
         gen_: torch.Generator,
     ) -> None:
-        ipex.llm.functional.varlen_attention(query, key, value, out, seqlen_q,
-                                             seqlen_k, max_seqlen_q,
-                                             max_seqlen_k, pdropout,
-                                             softmax_scale, zero_tensors,
-                                             is_causal, return_softmax, gen_)
+        ipex.llm.functional.varlen_attention(query.contiguous(),
+                                             key.contiguous(),
+                                             value.contiguous(), out,
+                                             seqlen_q.int(), seqlen_k.int(),
+                                             max_seqlen_q, max_seqlen_k,
+                                             pdropout, softmax_scale,
+                                             zero_tensors, is_causal,
+                                             return_softmax, gen_)

     @staticmethod
     def reshape_and_cache(
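
The wrappers above now delegate straight to the IPEX 2.3 `ipex.llm.functional` kernels, and several of them switch from filling a caller-provided `out` tensor to returning their result. A short sketch of the updated calling conventions, assuming an XPU device and the pinned wheels (shapes are illustrative):

    # Hedged sketch: exercises only the ops whose signatures changed in this diff.
    import torch
    from vllm._ipex_ops import ipex_ops as ops

    x = torch.randn(4, 2 * 128, device="xpu", dtype=torch.float16)
    out = torch.empty(4, 128, device="xpu", dtype=torch.float16)
    ops.silu_and_mul(out, x)              # still writes into `out`

    h = torch.randn(4, 128, device="xpu", dtype=torch.float16)
    w = torch.ones(128, device="xpu", dtype=torch.float16)
    normed = ops.rms_norm(h, w, 1e-6)     # now returns a tensor instead of filling `out`
    act = ops.gelu_fast(h)                # gelu_fast / gelu_new now return tensors
    q = torch.empty_like(h)
    ops.gelu_quick(q, h)                  # newly backed by ipex.llm.functional.gelu_quick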

vllm/attention/backends/ipex_attn.py (6 additions, 2 deletions)

@@ -49,14 +49,18 @@ def swap_blocks(
         dst_kv_cache: torch.Tensor,
         src_to_dst: torch.Tensor,
     ) -> None:
-        PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+        from vllm._ipex_ops import ipex_ops as ops
+        ops.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)

     @staticmethod
     def copy_blocks(
         kv_caches: List[torch.Tensor],
         src_to_dists: torch.Tensor,
     ) -> None:
-        PagedAttention.copy_blocks(kv_caches, src_to_dists)
+        from vllm._ipex_ops import ipex_ops as ops
+        key_caches = [kv_cache[0] for kv_cache in kv_caches]
+        value_caches = [kv_cache[1] for kv_cache in kv_caches]
+        ops.copy_blocks(key_caches, value_caches, src_to_dists)


 @dataclass
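
copy_blocks now splits each stacked KV-cache tensor into its key and value halves before calling the IPEX op. An illustrative sketch of that path; the cache shape below is an assumption for demonstration only, not the layout vLLM's CacheEngine actually allocates:

    # Hedged sketch: dim 0 of each cache tensor is assumed to stack key and value.
    import torch
    from vllm._ipex_ops import ipex_ops as ops

    num_layers, num_blocks, num_heads, block_size, head_size = 2, 8, 4, 16, 64
    kv_caches = [
        torch.zeros(2, num_blocks, num_heads, block_size, head_size,
                    device="xpu", dtype=torch.float16)
        for _ in range(num_layers)
    ]
    # Copy block 0 -> 4 and block 1 -> 5 in every layer.
    src_to_dists = torch.tensor([[0, 4], [1, 5]], device="xpu", dtype=torch.int64)

    key_caches = [kv[0] for kv in kv_caches]
    value_caches = [kv[1] for kv in kv_caches]
    ops.copy_blocks(key_caches, value_caches, src_to_dists)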

vllm/model_executor/layers/activation.py (9 additions, 6 deletions)

@@ -114,9 +114,7 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         from vllm._ipex_ops import ipex_ops as ops

-        out = torch.empty_like(x)
-        ops.gelu_new(out, x)
-        return out
+        return ops.gelu_new(x)


 class FastGELU(CustomOp):
@@ -136,9 +134,7 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
     def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         from vllm._ipex_ops import ipex_ops as ops

-        out = torch.empty_like(x)
-        ops.gelu_fast(out, x)
-        return out
+        return ops.gelu_fast(x)


 class QuickGELU(CustomOp):
@@ -155,6 +151,13 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
         ops.gelu_quick(out, x)
         return out

+    def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm._ipex_ops import ipex_ops as ops
+
+        out = torch.empty_like(x)
+        ops.gelu_quick(out, x)
+        return out
+
     # TODO implement forward_xpu for QuickGELU
     # def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
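
With forward_xpu now defined on QuickGELU, the XPU path can be exercised directly; a hedged sketch, assuming an XPU device (the tensor shape is illustrative):

    # Hedged sketch: the new QuickGELU XPU path, backed by ipex gelu_quick.
    import torch
    from vllm.model_executor.layers.activation import QuickGELU

    layer = QuickGELU()
    x = torch.randn(2, 64, device="xpu", dtype=torch.float16)
    y = layer.forward_xpu(x)  # same shape and dtype as x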

vllm/model_executor/layers/layernorm.py (1 addition, 4 deletions)

@@ -82,14 +82,11 @@ def forward_xpu(
                 self.variance_epsilon,
             )
             return x, residual
-        out = torch.empty_like(x)
-        ops.rms_norm(
-            out,
+        return ops.rms_norm(
             x,
             self.weight.data,
             self.variance_epsilon,
         )
-        return out

     def extra_repr(self) -> str:
         s = f"hidden_size={self.weight.data.size(0)}"
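
Because ops.rms_norm now returns its result, the XPU RMSNorm path no longer needs a scratch output tensor. A hedged usage sketch, assuming an XPU device (hidden size is illustrative):

    # Hedged sketch: RMSNorm on XPU returns the normalized tensor directly.
    import torch
    from vllm.model_executor.layers.layernorm import RMSNorm

    norm = RMSNorm(128, eps=1e-6).to(device="xpu", dtype=torch.float16)
    x = torch.randn(4, 128, device="xpu", dtype=torch.float16)
    y = norm.forward_xpu(x)                                       # single tensor
    x2, res = norm.forward_xpu(x, residual=torch.zeros_like(x))   # fused path returns both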
