61 | 61 | from vllm.v1.worker.utils import (gather_mm_placeholders, sanity_check_mm_encoder_outputs, scatter_mm_placeholders) |
62 | 62 | from vllm.v1.sample.rejection_sampler import RejectionSampler |
63 | 63 | from vllm.v1.spec_decode.eagle import EagleProposer |
64 | | -from vllm.v1.spec_decode.medusa import MedusaProposer |
65 | 64 | from vllm.v1.spec_decode.metadata import SpecDecodeMetadata |
66 | 65 | from vllm.v1.spec_decode.ngram_proposer import NgramProposer |
67 | 66 | from vllm.v1.sample.logits_processor import build_logitsprocs |
@@ -668,7 +667,6 @@ def __init__( |
668 | 667 | self.use_aux_hidden_state_outputs = True |
669 | 668 | elif self.speculative_config.method == "medusa": |
670 | 669 | raise NotImplementedError("Medusa speculative decoding is not supported on HPU.") |
671 | | - self.drafter = MedusaProposer(vllm_config=self.vllm_config, device=self.device) # type: ignore |
672 | 670 | else: |
673 | 671 | raise ValueError("Unknown speculative decoding method: " |
674 | 672 | f"{self.speculative_config.method}") |
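
With the `MedusaProposer` import removed above, the `medusa` branch now fails fast instead of constructing a proposer. The following is a minimal standalone sketch of the resulting control flow; the stub classes and the `make_drafter` helper are hypothetical illustrations, and the constructor argument lists are assumptions — only the branch shape mirrors the diff.

```python
# Hypothetical stand-ins so the dispatch sketch runs on its own; the
# real classes live in vllm.v1.spec_decode.eagle / ngram_proposer.
class EagleProposer:
    def __init__(self, vllm_config=None, device=None): ...

class NgramProposer:
    def __init__(self, vllm_config=None): ...

def make_drafter(method: str, vllm_config=None, device=None):
    """Mirror of the branch in the diff: Medusa fails fast on HPU."""
    if method == "ngram":
        return NgramProposer(vllm_config)
    if method in ("eagle", "eagle3"):
        return EagleProposer(vllm_config=vllm_config, device=device)
    if method == "medusa":
        # MedusaProposer is no longer imported, so there is nothing to
        # construct; raising keeps the failure explicit and early.
        raise NotImplementedError(
            "Medusa speculative decoding is not supported on HPU.")
    raise ValueError(f"Unknown speculative decoding method: {method}")

# Usage: make_drafter("medusa") raises NotImplementedError.
```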
@@ -3924,20 +3922,6 @@ def __del__(self): |
3924 | 3922 | @torch.inference_mode() |
3925 | 3923 | def profile_run(self) -> None: |
3926 | 3924 | return |
3927 | | - """Profile to measure peak memory during forward pass.""" |
3928 | | - # use an empty tensor instead of `None`` to force Dynamo to pass |
3929 | | - # it by reference, rather by specializing on the value `None`. |
3930 | | - # the `dtype` argument does not matter, and we use `float32` as |
3931 | | - # a placeholder (it has wide hardware support). |
3932 | | - # it is important to create tensors inside the loop, rather than |
3933 | | - # multiplying the list, to avoid Dynamo from treating them as |
3934 | | - # tensor aliasing. |
3935 | | - |
3936 | | - # Run empty prefill forwards - prefill max batch and prefill max seq |
3937 | | - self._execute_dummy_scenario((1, self.max_model_len, 0), None) |
3938 | | - max_seq_len = math.ceil( |
3939 | | - (self.max_num_tokens // self.max_prefill_batch_size) / self.block_size) * self.block_size |
3940 | | - self._execute_dummy_scenario((self.max_prefill_batch_size, max_seq_len, 0), None) |
3941 | 3925 |
|
3942 | 3926 | def _dummy_run(self, max_num_batched_tokens: int) -> None: |
3943 | 3927 | assert max_num_batched_tokens == 1 |
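
For reference, the deleted `profile_run` body issued two dummy prefill scenarios — one at `(1, self.max_model_len, 0)` and one at the maximum prefill batch size — with the second scenario's sequence length rounded up to a whole number of KV-cache blocks. Below is a minimal standalone sketch of that rounding; the concrete values are illustrative assumptions, not vLLM constants.

```python
import math

# Illustrative values; the real ones come from the runner's config.
max_num_tokens = 1000          # scheduler token budget (assumed)
max_prefill_batch_size = 4     # max prefill batch size (assumed)
block_size = 128               # KV-cache block size (assumed)

# Per-request token budget, rounded up to a whole number of KV-cache
# blocks, as the deleted profile_run body computed it.
max_seq_len = math.ceil(
    (max_num_tokens // max_prefill_batch_size) / block_size) * block_size
print(max_seq_len)  # 1000 // 4 = 250 tokens -> 2 blocks -> 256
```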
|