Commit a2eff87

[0.10.2][Security] Remove structurally dead code (#444) (#460)
Cherry-pick #444 from branch releases/v0.11.0.

Signed-off-by: Artur Fierka <[email protected]>
1 parent 2ac7e79 commit a2eff87

3 files changed: +0 −23 lines


vllm_gaudi/extension/bucketing/common.py

Lines changed: 0 additions & 1 deletion

@@ -302,7 +302,6 @@ def get_filters(is_prompt, use_merged_prefill, use_contiguous_pa):
         return filters_map[phase][use_merged_prefill]
     else:
         return filters_map[phase][use_contiguous_pa]
-    return []

     buckets = set()
     buckets_2d = set()

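The deleted `return []` was structurally dead: both branches of the preceding if/else already return, so the trailing statement can never execute. Below is a minimal, self-contained sketch of that control-flow shape; the `filters_map` contents and function name are hypothetical stand-ins, not the repo's real filter tables.

```python
# Sketch only: every path through the if/else already returns, so the
# trailing `return []` (the line removed by this commit) is unreachable.
def get_filters_sketch(is_prompt: bool, use_merged_prefill: bool, use_contiguous_pa: bool) -> list:
    # Hypothetical filter tables standing in for the real filters_map in
    # vllm_gaudi/extension/bucketing/common.py.
    filters_map = {
        "prompt": {True: ["merged_prefill_filter"], False: ["default_prompt_filter"]},
        "decode": {True: ["contiguous_pa_filter"], False: ["default_decode_filter"]},
    }
    phase = "prompt" if is_prompt else "decode"
    if is_prompt:
        return filters_map[phase][use_merged_prefill]
    else:
        return filters_map[phase][use_contiguous_pa]
    return []  # structurally dead: both branches above already return


if __name__ == "__main__":
    print(get_filters_sketch(True, use_merged_prefill=True, use_contiguous_pa=False))
    print(get_filters_sketch(False, use_merged_prefill=False, use_contiguous_pa=True))
```
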
vllm_gaudi/extension/ops.py

Lines changed: 0 additions & 6 deletions

@@ -1041,12 +1041,6 @@ def scaled_fp8_quant(
     if scale is None:
         raise "dynamic scaled_fp8_quant not implemented for HPU"
         # TODO: calculate scale to match gaudi2 240 range instead of 448
-        if use_per_token_if_dynamic:
-            scale = torch.empty((input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32)
-            torch.ops._C.dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
-        else:
-            scale = torch.zeros(1, device=input.device, dtype=torch.float32)
-            torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         output = torch.ops.hpu.cast_to_fp8_v2(input, 1 / scale, False, False, dtype=torch.float8_e4m3fn)[0]

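Here the deleted block sat directly after a `raise`, so the dynamic per-token and per-tensor quantization calls could never run. A minimal sketch of that shape follows; the function name is hypothetical and a simplified cast stands in for the real `torch.ops.hpu.cast_to_fp8_v2` call (assumes a PyTorch build with `float8_e4m3fn` support).

```python
# Sketch only: the dynamic-scale branch raises immediately, so any code
# placed after the raise inside that branch is structurally dead.
from typing import Optional

import torch


def scaled_fp8_quant_sketch(x: torch.Tensor, scale: Optional[torch.Tensor] = None) -> torch.Tensor:
    if scale is None:
        # Dynamic scaling is not implemented in this sketch either; everything
        # after this statement inside the branch would be unreachable.
        raise NotImplementedError("dynamic scaled_fp8_quant not implemented for HPU")
    # Static-scale path: a simplified stand-in for torch.ops.hpu.cast_to_fp8_v2.
    return (x / scale).to(torch.float8_e4m3fn)


if __name__ == "__main__":
    x = torch.randn(4, 8)
    print(scaled_fp8_quant_sketch(x, scale=torch.tensor(0.5)).dtype)  # torch.float8_e4m3fn
```
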
vllm_gaudi/v1/worker/hpu_model_runner.py

Lines changed: 0 additions & 16 deletions

@@ -61,7 +61,6 @@
 from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, scatter_mm_placeholders)
 from vllm.v1.sample.rejection_sampler import RejectionSampler
 from vllm.v1.spec_decode.eagle import EagleProposer
-from vllm.v1.spec_decode.medusa import MedusaProposer
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.sample.logits_processor import build_logitsprocs

@@ -668,7 +667,6 @@ def __init__(
             self.use_aux_hidden_state_outputs = True
         elif self.speculative_config.method == "medusa":
             raise NotImplementedError("Medusa speculative decoding is not supported on HPU.")
-            self.drafter = MedusaProposer(vllm_config=self.vllm_config, device=self.device)  # type: ignore
         else:
             raise ValueError("Unknown speculative decoding method: "
                              f"{self.speculative_config.method}")

@@ -3924,20 +3922,6 @@ def __del__(self):
     @torch.inference_mode()
     def profile_run(self) -> None:
         return
-        """Profile to measure peak memory during forward pass."""
-        # use an empty tensor instead of `None`` to force Dynamo to pass
-        # it by reference, rather by specializing on the value `None`.
-        # the `dtype` argument does not matter, and we use `float32` as
-        # a placeholder (it has wide hardware support).
-        # it is important to create tensors inside the loop, rather than
-        # multiplying the list, to avoid Dynamo from treating them as
-        # tensor aliasing.
-
-        # Run empty prefill forwards - prefill max batch and prefill max seq
-        self._execute_dummy_scenario((1, self.max_model_len, 0), None)
-        max_seq_len = math.ceil(
-            (self.max_num_tokens // self.max_prefill_batch_size) / self.block_size) * self.block_size
-        self._execute_dummy_scenario((self.max_prefill_batch_size, max_seq_len, 0), None)

     def _dummy_run(self, max_num_batched_tokens: int) -> None:
         assert max_num_batched_tokens == 1

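Both removals in this file follow the same two dead-code shapes: a statement placed after an unconditional `raise` (the `MedusaProposer` construction, which is also why its import became unused) and a body placed after an unconditional `return` (the remainder of `profile_run`). A compact sketch with hypothetical names, not the real `HPUModelRunner`:

```python
# Sketch only: illustrates the two unreachable-code shapes removed above.
class RunnerSketch:
    def __init__(self, method: str) -> None:
        if method == "eagle":
            self.drafter = "eagle-proposer"  # hypothetical stand-in for the supported path
        elif method == "medusa":
            raise NotImplementedError("Medusa speculative decoding is not supported on HPU.")
            # Anything placed here, e.g. constructing a MedusaProposer,
            # is unreachable because the raise above always fires.
        else:
            raise ValueError(f"Unknown speculative decoding method: {method}")

    def profile_run(self) -> None:
        return
        # Unreachable: the unconditional return above ends the method, so the
        # former warm-up/profiling code that followed it was structurally dead.


if __name__ == "__main__":
    runner = RunnerSketch("eagle")
    runner.profile_run()
    try:
        RunnerSketch("medusa")
    except NotImplementedError as err:
        print(err)
```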