@@ -5,6 +5,7 @@
 from typing import List, Union
 from urllib.parse import urlparse
 
+from tornado.iostream import StreamClosedError
 from tornado.web import HTTPError
 
 from ads.aqua.common.decorator import handle_exceptions
@@ -175,21 +176,9 @@ def list_shapes(self):
         )
 
 
-class AquaDeploymentInferenceHandler(AquaAPIhandler):
-    @staticmethod
-    def validate_predict_url(endpoint):
-        try:
-            url = urlparse(endpoint)
-            if url.scheme != "https":
-                return False
-            if not url.netloc:
-                return False
-            return url.path.endswith("/predict")
-        except Exception:
-            return False
-
+class AquaDeploymentStreamingInferenceHandler(AquaAPIhandler):
     @handle_exceptions
-    def post(self, *args, **kwargs):  # noqa: ARG002
+    async def post(self, *args, **kwargs):  # noqa: ARG002
         """
         Handles inference request for the Active Model Deployments
         Raises
@@ -205,12 +194,7 @@ def post(self, *args, **kwargs): # noqa: ARG002
         if not input_data:
             raise HTTPError(400, Errors.NO_INPUT_DATA)
 
-        endpoint = input_data.get("endpoint")
-        if not endpoint:
-            raise HTTPError(400, Errors.MISSING_REQUIRED_PARAMETER.format("endpoint"))
-
-        if not self.validate_predict_url(endpoint):
-            raise HTTPError(400, Errors.INVALID_INPUT_DATA_FORMAT.format("endpoint"))
+        model_deployment_id = input_data.get("id")
 
         prompt = input_data.get("prompt")
         if not prompt:
@@ -226,11 +210,24 @@ def post(self, *args, **kwargs): # noqa: ARG002
                 400, Errors.INVALID_INPUT_DATA_FORMAT.format("model_params")
             ) from ex
 
-        return self.finish(
-            MDInferenceResponse(prompt, model_params_obj).get_model_deployment_response(
-                endpoint
-            )
-        )
+        self.set_header("Content-Type", "text/event-stream")
+        self.set_header("Cache-Control", "no-cache")
+        self.set_header("Transfer-Encoding", "chunked")
+        await self.flush()
+
+        try:
+            response_gen = MDInferenceResponse(
+                prompt, model_params_obj
+            ).get_model_deployment_response(model_deployment_id)
+            for chunk in response_gen:
+                if not chunk:
+                    continue
+                self.write(f"data: {chunk}\n\n")
+                await self.flush()
+        except StreamClosedError:
+            self.log.warning("Client disconnected.")
+        finally:
+            self.finish()
 
 
 class AquaDeploymentParamsHandler(AquaAPIhandler):
@@ -294,5 +291,5 @@ def post(self, *args, **kwargs): # noqa: ARG002
     ("deployments/?([^/]*)", AquaDeploymentHandler),
     ("deployments/?([^/]*)/activate", AquaDeploymentHandler),
     ("deployments/?([^/]*)/deactivate", AquaDeploymentHandler),
-    ("inference", AquaDeploymentInferenceHandler),
+    ("inference", AquaDeploymentStreamingInferenceHandler),
 ]
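
For context, the new handler streams Server-Sent-Events-style lines ("data: <chunk>\n\n") instead of returning a single JSON response, so callers must read the response incrementally. Below is a minimal client sketch, not part of this change: the base URL, port, and deployment OCID are hypothetical placeholders, and only the "id", "prompt", and "model_params" fields are taken from the diff above.

import requests  # any HTTP client with streaming support would work

# Hypothetical URL; the actual prefix depends on how the Aqua handlers are mounted.
url = "http://localhost:8888/aqua/inference"

payload = {
    "id": "ocid1.datasciencemodeldeployment.oc1..<unique_id>",  # model deployment OCID (placeholder)
    "prompt": "What is the capital of France?",
    "model_params": {"max_tokens": 100},
}

with requests.post(url, json=payload, stream=True) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines(decode_unicode=True):
        # The handler writes each event as "data: <chunk>" followed by a blank line.
        if line and line.startswith("data: "):
            print(line[len("data: "):])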