
Commit d1d942d

Updating payload format
1 parent (177f888) · commit d1d942d

File tree

5 files changed: +85 -72 lines changed

ads/aqua/app.py
ads/aqua/extension/deployment_handler.py
ads/aqua/modeldeployment/__init__.py
ads/aqua/modeldeployment/deployment.py
tests/unitary/with_extras/aqua/test_deployment.py

ads/aqua/app.py

Lines changed: 0 additions & 2 deletions
@@ -67,8 +67,6 @@ def __init__(self) -> None:
         self._md_auth = default_signer({"service_endpoint": OCI_MD_SERVICE_ENDPOINT})
         self.ds_client = oc.OCIClientFactory(**self._auth).data_science
         self.compute_client = oc.OCIClientFactory(**default_signer()).compute
-        print("self._md_auth: ", self._md_auth)
-        print("OCI_MD_SERVICE_ENDPOINT: ", OCI_MD_SERVICE_ENDPOINT)
         self.model_deployment_client = oc.OCIClientFactory(
             **self._md_auth
         ).model_deployment

ads/aqua/extension/deployment_handler.py

Lines changed: 18 additions & 23 deletions
@@ -11,8 +11,7 @@
 from ads.aqua.common.decorator import handle_exceptions
 from ads.aqua.extension.base_handler import AquaAPIhandler
 from ads.aqua.extension.errors import Errors
-from ads.aqua.modeldeployment import AquaDeploymentApp, MDInferenceResponse
-from ads.aqua.modeldeployment.entities import ModelParams
+from ads.aqua.modeldeployment import AquaDeploymentApp
 from ads.config import COMPARTMENT_OCID


@@ -178,9 +177,9 @@ def list_shapes(self):

 class AquaDeploymentStreamingInferenceHandler(AquaAPIhandler):
     @handle_exceptions
-    async def post(self, *args, **kwargs):  # noqa: ARG002
+    async def post(self, model_deployment_id):
         """
-        Handles inference request for the Active Model Deployments
+        Handles streaming inference request for the Active Model Deployments
         Raises
         ------
         HTTPError
@@ -194,38 +193,34 @@ async def post(self, *args, **kwargs):  # noqa: ARG002
         if not input_data:
             raise HTTPError(400, Errors.NO_INPUT_DATA)

-        model_deployment_id = input_data.get("id")
-
         prompt = input_data.get("prompt")
-        if not prompt:
-            raise HTTPError(400, Errors.MISSING_REQUIRED_PARAMETER.format("prompt"))
+        messages = input_data.get("messages")

-        model_params = (
-            input_data.get("model_params") if input_data.get("model_params") else {}
-        )
-        try:
-            model_params_obj = ModelParams(**model_params)
-        except Exception as ex:
+        if not prompt and not messages:
             raise HTTPError(
-                400, Errors.INVALID_INPUT_DATA_FORMAT.format("model_params")
-            ) from ex
+                400, Errors.MISSING_REQUIRED_PARAMETER.format("prompt/messages")
+            )
+        if not input_data.get("model"):
+            raise HTTPError(400, Errors.MISSING_REQUIRED_PARAMETER.format("model"))
+
+        if "stream" not in input_data:
+            input_data.update(stream=True)

         self.set_header("Content-Type", "text/event-stream")
         self.set_header("Cache-Control", "no-cache")
         self.set_header("Transfer-Encoding", "chunked")
         await self.flush()
-
         try:
-            response_gen = MDInferenceResponse(
-                prompt, model_params_obj
-            ).get_model_deployment_response(model_deployment_id)
+            response_gen = AquaDeploymentApp().get_model_deployment_response(
+                model_deployment_id, input_data
+            )
             for chunk in response_gen:
                 if not chunk:
                     continue
                 self.write(f"data: {chunk}\n\n")
                 await self.flush()
-        except StreamClosedError:
-            self.log.warning("Client disconnected.")
+        except StreamClosedError as ex:
+            raise HTTPError(500, str(ex)) from ex
         finally:
             self.finish()

@@ -291,5 +286,5 @@ def post(self, *args, **kwargs):  # noqa: ARG002
     ("deployments/?([^/]*)", AquaDeploymentHandler),
     ("deployments/?([^/]*)/activate", AquaDeploymentHandler),
     ("deployments/?([^/]*)/deactivate", AquaDeploymentHandler),
-    ("inference", AquaDeploymentStreamingInferenceHandler),
+    ("inference/stream/?([^/]*)", AquaDeploymentStreamingInferenceHandler),
 ]
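
The route change above moves the deployment OCID out of the request body and into the URL path, and the body now carries OpenAI-style completion fields ("prompt" or "messages", plus a required "model"). A minimal client-side sketch of the new contract, assuming the handler is mounted under the Aqua extension of a local notebook server; the base URL and OCID below are placeholders:

```python
import requests  # assumes the requests package; any streaming-capable HTTP client works

# Placeholder values: substitute your notebook server URL and a live deployment OCID.
BASE_URL = "http://localhost:8888/aqua"
DEPLOYMENT_OCID = "ocid1.datasciencemodeldeployment.oc1..<ocid>"

payload = {
    "model": "odsc-llm",       # required; the handler returns 400 without it
    "prompt": "What is 1+1?",  # or "messages": [...] for chat-style models
    "max_tokens": 64,
    # "stream" can be omitted: the handler defaults it to True.
}

# The handler writes Server-Sent-Events-style frames: "data: <chunk>\n\n".
with requests.post(
    f"{BASE_URL}/inference/stream/{DEPLOYMENT_OCID}", json=payload, stream=True
) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            print(line[len("data: "):], end="", flush=True)
```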

ads/aqua/modeldeployment/__init__.py

Lines changed: 2 additions & 4 deletions
@@ -1,8 +1,6 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright (c) 2024 Oracle and/or its affiliates.
+# Copyright (c) 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 from ads.aqua.modeldeployment.deployment import AquaDeploymentApp
-from ads.aqua.modeldeployment.inference import MDInferenceResponse

-__all__ = ["AquaDeploymentApp", "MDInferenceResponse"]
+__all__ = ["AquaDeploymentApp"]

ads/aqua/modeldeployment/deployment.py

Lines changed: 58 additions & 3 deletions
@@ -628,7 +628,9 @@ def _create_multi(
             config_data["model_task"] = model.model_task

             if model.fine_tune_weights_location:
-                config_data["fine_tune_weights_location"] = model.fine_tune_weights_location
+                config_data["fine_tune_weights_location"] = (
+                    model.fine_tune_weights_location
+                )

             model_config.append(config_data)
             model_name_list.append(model.model_name)
@@ -789,7 +791,7 @@ def _create_deployment(
         telemetry_kwargs = {"ocid": get_ocid_substring(deployment_id, key_len=8)}

         if Tags.BASE_MODEL_CUSTOM in tags:
-            telemetry_kwargs[ "custom_base_model"] = True
+            telemetry_kwargs["custom_base_model"] = True

         # tracks unique deployments that were created in the user compartment
         self.telemetry.record_event_async(
@@ -1309,4 +1311,57 @@ def list_shapes(self, **kwargs) -> List[ComputeShapeSummary]:
                 or gpu_specs.shapes.get(oci_shape.name.upper()),
             )
             for oci_shape in oci_shapes
-        ]
+        ]
+
+    @staticmethod
+    def _stream_sanitizer(response):
+        for chunk in response.data.raw.stream(1024 * 1024, decode_content=True):
+            if not chunk:
+                continue
+
+            try:
+                decoded = chunk.decode("utf-8").strip()
+                if not decoded.startswith("data:"):
+                    continue
+
+                data_json = decoded[len("data:") :].strip()
+                parsed = json.loads(data_json)
+                text = parsed["choices"][0]["text"]
+                yield text
+
+            except Exception:
+                continue
+
+    @telemetry(entry_point="plugin=inference&action=get_response", name="aqua")
+    def get_model_deployment_response(self, model_deployment_id: str, payload: dict):
+        """
+        Returns Model deployment inference response in streaming fashion
+
+        Parameters
+        ----------
+        model_deployment_id: str
+            Model deployment ocid
+        payload: dict
+            model params.
+            {
+                "max_tokens": 1024,
+                "temperature": 0.5,
+                "prompt": "what are some good skills deep learning expert. Give us some tips on how to structure interview with some coding example?",
+                "top_p": 0.4,
+                "top_k": 100,
+                "model": "odsc-llm",
+                "frequency_penalty": 1,
+                "presence_penalty": 1,
+                "stream": true
+            }
+
+        Returns
+        -------
+        Model deployment inference response in streaming fashion
+
+        """
+
+        response = self.model_deployment_client.predict_with_response_stream(
+            model_deployment_id=model_deployment_id, request_body=payload
+        )
+        yield from self._stream_sanitizer(response)
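
A short usage sketch of the new generator: `predict_with_response_stream` is the model-deployment client call the method wraps, and `_stream_sanitizer` reduces each SSE frame to its `choices[0].text` field. The OCID is a placeholder and the payload mirrors the docstring example:

```python
from ads.aqua.modeldeployment import AquaDeploymentApp

app = AquaDeploymentApp()

payload = {
    "model": "odsc-llm",
    "prompt": "What is 1+1?",
    "max_tokens": 128,
    "temperature": 0.5,
    "stream": True,  # ask the serving container for incremental chunks
}

# Placeholder OCID: substitute an active model deployment in your tenancy.
for text in app.get_model_deployment_response(
    "ocid1.datasciencemodeldeployment.oc1..<ocid>", payload
):
    print(text, end="", flush=True)
```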

tests/unitary/with_extras/aqua/test_deployment.py

Lines changed: 7 additions & 40 deletions
@@ -487,7 +487,7 @@ class TestDataset:
                 "model_name": "test_model_1",
                 "model_task": "text_embedding",
                 "artifact_location": "test_location_1",
-                "fine_tune_weights_location" : None
+                "fine_tune_weights_location": None,
             },
             {
                 "env_var": {},
@@ -496,7 +496,7 @@ class TestDataset:
                 "model_name": "test_model_2",
                 "model_task": "image_text_to_text",
                 "artifact_location": "test_location_2",
-                "fine_tune_weights_location" : None
+                "fine_tune_weights_location": None,
             },
             {
                 "env_var": {},
@@ -505,7 +505,7 @@ class TestDataset:
                 "model_name": "test_model_3",
                 "model_task": "code_synthesis",
                 "artifact_location": "test_location_3",
-                "fine_tune_weights_location" : "oci://test_bucket@test_namespace/models/ft-models/meta-llama-3b/ocid1.datasciencejob.oc1.iad.<ocid>"
+                "fine_tune_weights_location": "oci://test_bucket@test_namespace/models/ft-models/meta-llama-3b/ocid1.datasciencejob.oc1.iad.<ocid>",
             },
         ],
         "model_id": "ocid1.datasciencemodel.oc1.<region>.<OCID>",
@@ -972,7 +972,7 @@ class TestDataset:
             "model_name": "model_one",
             "model_task": "text_embedding",
             "artifact_location": "artifact_location_one",
-            "fine_tune_weights_location": None
+            "fine_tune_weights_location": None,
         },
         {
             "env_var": {"--test_key_two": "test_value_two"},
@@ -981,7 +981,7 @@ class TestDataset:
             "model_name": "model_two",
             "model_task": "image_text_to_text",
             "artifact_location": "artifact_location_two",
-            "fine_tune_weights_location": None
+            "fine_tune_weights_location": None,
         },
         {
             "env_var": {"--test_key_three": "test_value_three"},
@@ -990,7 +990,7 @@ class TestDataset:
             "model_name": "model_three",
             "model_task": "code_synthesis",
             "artifact_location": "artifact_location_three",
-            "fine_tune_weights_location" : "oci://test_bucket@test_namespace/models/ft-models/meta-llama-3b/ocid1.datasciencejob.oc1.iad.<ocid>"
+            "fine_tune_weights_location": "oci://test_bucket@test_namespace/models/ft-models/meta-llama-3b/ocid1.datasciencejob.oc1.iad.<ocid>",
         },
     ]

@@ -1817,7 +1817,7 @@ def test_create_deployment_for_multi_model(
             model_task="code_synthesis",
             gpu_count=2,
             artifact_location="test_location_3",
-            fine_tune_weights_location= "oci://test_bucket@test_namespace/models/ft-models/meta-llama-3b/ocid1.datasciencejob.oc1.iad.<ocid>"
+            fine_tune_weights_location="oci://test_bucket@test_namespace/models/ft-models/meta-llama-3b/ocid1.datasciencejob.oc1.iad.<ocid>",
         )

         result = self.app.create(
@@ -2283,36 +2283,3 @@ def test_validate_multimodel_deployment_feasibility_positive_single(
             total_gpus,
             "test_data/deployment/aqua_summary_multi_model_single.json",
         )
-
-
-class TestMDInferenceResponse(unittest.TestCase):
-    def setUp(self):
-        self.app = MDInferenceResponse()
-
-    @classmethod
-    def setUpClass(cls):
-        cls.curr_dir = os.path.dirname(os.path.abspath(__file__))
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.curr_dir = None
-
-    @patch("requests.post")
-    def test_get_model_deployment_response(self, mock_post):
-        """Test to check if model deployment response is returned correctly."""
-
-        endpoint = TestDataset.MODEL_DEPLOYMENT_URL + "/predict"
-        self.app.prompt = "What is 1+1?"
-        self.app.model_params = ModelParams(**TestDataset.model_params)
-
-        mock_response = MagicMock()
-        response_json = os.path.join(
-            self.curr_dir, "test_data/deployment/aqua_deployment_response.json"
-        )
-        with open(response_json, "r") as _file:
-            mock_response.content = _file.read()
-        mock_response.status_code = 200
-        mock_post.return_value = mock_response
-
-        result = self.app.get_model_deployment_response(endpoint)
-        assert result["choices"][0]["text"] == " The answer is 2"
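
The deleted `TestMDInferenceResponse` has no replacement in this commit. A hedged sketch of how the new streaming path could be unit-tested by faking the raw byte stream; the `__init__` bypass, the fixture chunks, and the assumption that the `@telemetry` decorator is side-effect-free in a test environment are all hypothetical, not part of the commit:

```python
from unittest.mock import MagicMock

from ads.aqua.modeldeployment import AquaDeploymentApp


def test_get_model_deployment_response_streaming():
    """Hypothetical test: the generator should yield choices[0].text from SSE frames."""
    # Bypass __init__ so no OCI auth is needed; inject a fake client instead.
    app = AquaDeploymentApp.__new__(AquaDeploymentApp)
    app.model_deployment_client = MagicMock()

    mock_response = MagicMock()
    # Fake the raw byte stream the serving container would emit.
    mock_response.data.raw.stream.return_value = iter(
        [
            b'data: {"choices": [{"text": " The answer"}]}',
            b'data: {"choices": [{"text": " is 2"}]}',
            b"data: [DONE]",  # non-JSON payloads are skipped by _stream_sanitizer
        ]
    )
    app.model_deployment_client.predict_with_response_stream.return_value = mock_response

    chunks = list(
        app.get_model_deployment_response(
            "ocid1.datasciencemodeldeployment.oc1..<ocid>",
            {"model": "odsc-llm", "prompt": "What is 1+1?", "stream": True},
        )
    )
    assert "".join(chunks) == " The answer is 2"
```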
