diff --git a/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_neuron_scheduler.py b/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_neuron_scheduler.py index 06dd2ee0e..5f62fe5f7 100644 --- a/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_neuron_scheduler.py +++ b/engines/python/setup/djl_python/transformers_neuronx_scheduler/optimum_neuron_scheduler.py @@ -177,8 +177,9 @@ def make_generations( slot_request.slot.clear() return generation, finish_reason - def _generate_token(self, inputs: GenerationInputs, - prefill: Optional[bool]) -> List[Generation]: + def _generate_token(self, + inputs: GenerationInputs, + prefill: Optional[bool] = None) -> List[Generation]: """Prepare inputs for batching strategy Args: inputs (GenerationInputs): inputs tokenized tensor values diff --git a/engines/python/setup/djl_python/transformers_neuronx_scheduler/slot.py b/engines/python/setup/djl_python/transformers_neuronx_scheduler/slot.py index 751eaa58f..c4bcbbec6 100644 --- a/engines/python/setup/djl_python/transformers_neuronx_scheduler/slot.py +++ b/engines/python/setup/djl_python/transformers_neuronx_scheduler/slot.py @@ -141,7 +141,7 @@ def assign(self, request: Request, generation_config: GenerationConfig, if self._generation_config.do_sample: self._generation_config.temperature = param.get("temperature", 0.9) self._generation_config.top_k = param.get("top_k", 0) - self._generation_config.top_p = param.get("top_p", 1.0) + self._generation_config.top_p = param.get("top_p", 0.9) self._generation_config.typical_p = param.get("typical_p", 1.0) self.seed = int(param.get("seed", 0)) diff --git a/tests/integration/tests.py b/tests/integration/tests.py index ae9e73e24..d2c042a4a 100644 --- a/tests/integration/tests.py +++ b/tests/integration/tests.py @@ -727,30 +727,6 @@ def test_gpt2_quantize(self): r.launch(container='pytorch-inf2-1') client.run("transformers_neuronx gpt2-quantize".split()) - def test_opt_1_3b(self): - with Runner('pytorch-inf2', 'opt-1.3b') as r: - prepare.build_transformers_neuronx_handler_model("opt-1.3b") - r.launch(container='pytorch-inf2-6') - client.run("transformers_neuronx opt-1.3b".split()) - - def test_gpt_j_6b(self): - with Runner('pytorch-inf2', 'gpt-j-6b') as r: - prepare.build_transformers_neuronx_handler_model("gpt-j-6b") - r.launch(container='pytorch-inf2-6') - client.run("transformers_neuronx gpt-j-6b".split()) - - def test_pythia(self): - with Runner('pytorch-inf2', 'pythia-2.8b') as r: - prepare.build_transformers_neuronx_handler_model("pythia-2.8b") - r.launch(container='pytorch-inf2-2') - client.run("transformers_neuronx pythia-2.8b".split()) - - def test_bloom(self): - with Runner('pytorch-inf2', 'bloom-7b1') as r: - prepare.build_transformers_neuronx_handler_model("bloom-7b1") - r.launch(container='pytorch-inf2-2') - client.run("transformers_neuronx bloom-7b1".split()) - @pytest.mark.parametrize("model", ["tiny-llama-rb-aot", "tiny-llama-rb-aot-quant"]) def test_partition(self, model):