Skip to content

[AQUA Telemetry] Update MD Tracking #1193

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Jun 18, 2025
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ads/aqua/modeldeployment/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@

This module contains constants used in Aqua Model Deployment.
"""
DEFAULT_WAIT_TIME = 1200
DEFAULT_POLL_INTERVAL = 10
61 changes: 58 additions & 3 deletions ads/aqua/modeldeployment/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from typing import Dict, List, Optional, Union

from cachetools import TTLCache, cached
from ads.aqua.modeldeployment.constants import DEFAULT_POLL_INTERVAL, DEFAULT_WAIT_TIME
from ads.common.decorator.threaded import threaded
from ads.common.work_request import DataScienceWorkRequest
from oci.data_science.models import ModelDeploymentShapeSummary
from pydantic import ValidationError

Expand Down Expand Up @@ -46,8 +49,9 @@
AQUA_MULTI_MODEL_CONFIG,
MODEL_BY_REFERENCE_OSS_PATH_KEY,
MODEL_NAME_DELIMITER,
UNKNOWN_DICT,
UNKNOWN_DICT
)

from ads.aqua.data import AquaResourceIdentifier
from ads.aqua.model import AquaModelApp
from ads.aqua.model.constants import AquaModelMetadataKeys, ModelCustomMetadataFields
Expand Down Expand Up @@ -81,9 +85,10 @@
ModelDeploymentInfrastructure,
ModelDeploymentMode,
)

from ads.model.model_metadata import ModelCustomMetadataItem
from ads.telemetry import telemetry

from ads.common.decorator.threaded import thread_pool

class AquaDeploymentApp(AquaApp):
"""Provides a suite of APIs to interact with Aqua model deployments within the Oracle
Expand Down Expand Up @@ -788,8 +793,13 @@ def _create_deployment(

deployment_id = deployment.id
logger.info(
f"Aqua model deployment {deployment_id} created for model {aqua_model_id}."
f"Aqua model deployment {deployment_id} created for model {aqua_model_id}. Work request Id is {deployment.dsc_model_deployment.workflow_req_id}"
)

thread_pool.submit( self.get_deployment_status ,
deployment_id,
deployment.dsc_model_deployment.workflow_req_id,
model_type)

# we arbitrarily choose last 8 characters of OCID to identify MD in telemetry
telemetry_kwargs = {"ocid": get_ocid_substring(deployment_id, key_len=8)}
Expand Down Expand Up @@ -1313,3 +1323,48 @@ def list_shapes(self, **kwargs) -> List[ComputeShapeSummary]:
)
for oci_shape in oci_shapes
]

def get_deployment_status(self,model_deployment_id: str, work_request_id : str, model_type : str) -> None:
"""Waits for the data science model deployment to be completed and log its status in telemetry.

Parameters
----------

model_deployment_id: str
The id of the deployed aqua model.
work_request_id: str
The work request Id of the model deployment.
model_type: str
The type of aqua model to be deployed. Allowed values are: `custom`, `service` and `multi_model`.

Returns
-------
AquaDeployment
An Aqua deployment instance.
"""
telemetry_kwargs = {"ocid": get_ocid_substring(model_deployment_id, key_len=8)}

data_science_work_request:DataScienceWorkRequest = DataScienceWorkRequest(work_request_id)

try:
data_science_work_request.wait_work_request(
progress_bar_description="Creating model deployment",
max_wait_time=DEFAULT_WAIT_TIME,
poll_interval=DEFAULT_POLL_INTERVAL
)
except Exception as e:
logger.error(
"Error while trying to create model deployment: " + str(e)
)
self.telemetry.record_event_async(
category=f"aqua/{model_type}/deployment/status",
action="FAILED",
detail=data_science_work_request._error_message
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can _error_message be None for any reason? Might be good to do detail=data_science_work_request._error_message or UNKNOWN to avoid unforeseen issues in telemetry logging.

**telemetry_kwargs
)
else :
self.telemetry.record_event_async(
category=f"aqua/{model_type}/deployment/status",
action="SUCCEEDED",
**telemetry_kwargs
)
4 changes: 3 additions & 1 deletion ads/common/work_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(
description: str = "Processing",
config: dict = None,
signer: Signer = None,
client_kwargs: dict = None,
client_kwargs: dict = None,
**kwargs
) -> None:
"""Initializes ADSWorkRequest object.
Expand Down Expand Up @@ -65,6 +65,7 @@ def __init__(
self._description = description
self._percentage = 0
self._status = None
_error_message = None
super().__init__(config, signer, client_kwargs, **kwargs)


Expand All @@ -78,6 +79,7 @@ def _sync(self):
self._percentage= work_request.percent_complete
self._status = work_request.status
self._description = work_request_logs[-1].message if work_request_logs else "Processing"
if work_request.status == 'FAILED' : self._error_message = self.client.list_work_request_errors
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

might be good to show an example output for failed and successful MD in the PR description.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also let's use ruff formatter to format the code.


def watch(
self,
Expand Down
17 changes: 6 additions & 11 deletions ads/model/service/oci_datascience_model_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,9 @@ def activate(
self.id,
)


self.workflow_req_id = response.headers.get("opc-work-request-id", None)
if wait_for_completion:
self.workflow_req_id = response.headers.get("opc-work-request-id", None)

try:
DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
progress_bar_description="Activating model deployment",
Expand Down Expand Up @@ -233,11 +233,9 @@ def create(
response = self.client.create_model_deployment(create_model_deployment_details)
self.update_from_oci_model(response.data)
logger.info(f"Creating model deployment `{self.id}`.")
print(f"Model Deployment OCID: {self.id}")

self.workflow_req_id = response.headers.get("opc-work-request-id", None)
if wait_for_completion:
self.workflow_req_id = response.headers.get("opc-work-request-id", None)

try:
DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
progress_bar_description="Creating model deployment",
Expand Down Expand Up @@ -287,10 +285,8 @@ def deactivate(
response = self.client.deactivate_model_deployment(
self.id,
)

self.workflow_req_id = response.headers.get("opc-work-request-id", None)
if wait_for_completion:
self.workflow_req_id = response.headers.get("opc-work-request-id", None)

try:
DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
progress_bar_description="Deactivating model deployment",
Expand Down Expand Up @@ -355,10 +351,9 @@ def delete(
response = self.client.delete_model_deployment(
self.id,
)


self.workflow_req_id = response.headers.get("opc-work-request-id", None)
if wait_for_completion:
self.workflow_req_id = response.headers.get("opc-work-request-id", None)

try:
DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
progress_bar_description="Deleting model deployment",
Expand Down
2 changes: 1 addition & 1 deletion ads/telemetry/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,4 @@ def record_event_async(
Thread
A started thread to send a head request to generate an event record.
"""
thread_pool.submit(self.record_event, args=(category, action, detail), kwargs=kwargs)
thread_pool.submit(self.record_event, category, action, detail, **kwargs)
25 changes: 25 additions & 0 deletions tests/unitary/with_extras/aqua/test_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from importlib import reload
from unittest.mock import MagicMock, patch

from ads.aqua.modeldeployment.constants import DEFAULT_POLL_INTERVAL, DEFAULT_WAIT_TIME
from ads.model.service.oci_datascience_model_deployment import OCIDataScienceModelDeployment
import oci
import pytest
from oci.data_science.models import (
Expand Down Expand Up @@ -2282,3 +2284,26 @@ def test_validate_multimodel_deployment_feasibility_positive_single(
total_gpus,
"test_data/deployment/aqua_summary_multi_model_single.json",
)

def test_get_deployment_status(self) :
deployment_id = "fakeid.datasciencemodeldeployment.oc1.iad.xxx"
work_request_id = "fakeid.workrequest.oc1.iad.xxx"
model_type = "custom"

with patch(
"ads.model.service.oci_datascience_model_deployment.DataScienceWorkRequest.__init__"
) as mock_ds_work_request:
mock_ds_work_request.return_value = None
with patch(
"ads.model.service.oci_datascience_model_deployment.DataScienceWorkRequest.wait_work_request"
) as mock_wait:
self.app.get_deployment_status(
deployment_id, work_request_id, model_type
)

mock_ds_work_request.assert_called_with("test")
mock_wait.assert_called_with(
progress_bar_description='Creating model deployment',
max_wait_time=DEFAULT_WAIT_TIME,
poll_interval=DEFAULT_POLL_INTERVAL
)
Loading