Skip to content

Odsc 70841 update md tracking #1193

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ads/aqua/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@
SERVICE_MANAGED_CONTAINER_URI_SCHEME = "dsmc://"
SUPPORTED_FILE_FORMATS = ["jsonl"]
MODEL_BY_REFERENCE_OSS_PATH_KEY = "artifact_location"
DEFAULT_WAIT_TIME = 1200
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since these constants are specific to model deployment status, we could have it in the ads.aqua.modeldeployment.constants.py.

DEFAULT_POLL_INTERVAL = 10

CONSOLE_LINK_RESOURCE_TYPE_MAPPING = {
"datasciencemodel": "models",
Expand Down
47 changes: 45 additions & 2 deletions ads/aqua/modeldeployment/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from typing import Dict, List, Optional, Union

from cachetools import TTLCache, cached
import concurrent
from ads.common.work_request import DataScienceWorkRequest
from oci.data_science.models import ModelDeploymentShapeSummary
from pydantic import ValidationError

Expand Down Expand Up @@ -43,6 +45,8 @@
MODEL_BY_REFERENCE_OSS_PATH_KEY,
MODEL_NAME_DELIMITER,
UNKNOWN_DICT,
DEFAULT_WAIT_TIME,
DEFAULT_POLL_INTERVAL
)
from ads.aqua.data import AquaResourceIdentifier
from ads.aqua.model import AquaModelApp
Expand Down Expand Up @@ -80,6 +84,9 @@
from ads.model.model_metadata import ModelCustomMetadataItem
from ads.telemetry import telemetry

THREAD_POOL_SIZE = 16
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can move this constant in ads.aqua.modeldeployment.constants.py

thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=THREAD_POOL_SIZE)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should use the common telemetry or the common decorator threadpool instead of creating one here.



class AquaDeploymentApp(AquaApp):
"""Provides a suite of APIs to interact with Aqua model deployments within the Oracle
Expand Down Expand Up @@ -780,11 +787,18 @@ def _create_deployment(
.with_runtime(container_runtime)
).deploy(wait_for_completion=False)

deployment_id = deployment.id


deployment_id = deployment.id()
Copy link
Member

@VipulMascarenhas VipulMascarenhas May 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

id is a property, keep it as .id instead else it will result in TypeError

logger.info(
f"Aqua model deployment {deployment_id} created for model {aqua_model_id}."
)

thread_pool.submit(self.get_deployment_status,
model_deployment_id=deployment_id,
work_request_id=deployment.dsc_model_deployment.workflow_req_id,
model_type=model_type)

# we arbitrarily choose last 8 characters of OCID to identify MD in telemetry
telemetry_kwargs = {"ocid": get_ocid_substring(deployment_id, key_len=8)}

Expand Down Expand Up @@ -1309,4 +1323,33 @@ def list_shapes(self, **kwargs) -> List[ComputeShapeSummary]:
or gpu_specs.shapes.get(oci_shape.name.upper()),
)
for oci_shape in oci_shapes
]
]


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add docstrings

def get_deployment_status(self,model_deployment_id: str, work_request_id : str, model_type : str) :

telemetry_kwargs = {"ocid": get_ocid_substring(model_deployment_id, key_len=8)}

try:
DataScienceWorkRequest(work_request_id).wait_work_request(
progress_bar_description="Creating model deployment",
max_wait_time=DEFAULT_WAIT_TIME,
poll_interval=DEFAULT_POLL_INTERVAL
)
except Exception as e:
logger.error(
"Error while trying to create model deployment: " + str(e)
)
self.telemetry.record_event_async(
category=f"aqua/{model_type}/deployment/status",
action="FAILED",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should log the error message (_error_message) coming from work request here instead of a static message. This will be used to track the specific reasons why the deployment failed.

detail="Error creating model deployment"
**telemetry_kwargs
)

Copy link
Member

@VipulMascarenhas VipulMascarenhas May 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't this be within else block? We can use try-except-else here.

self.telemetry.record_event_async(
category=f"aqua/{model_type}/deployment/status",
action="SUCCEEDED",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can skip this detail, action "SUCCEEDED" implies the same thing.

detail=" Create model deployment successful",
**telemetry_kwargs
)
3 changes: 3 additions & 0 deletions ads/common/work_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(
config: dict = None,
signer: Signer = None,
client_kwargs: dict = None,
_error_message: str = None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we need a parameter called _error_message here? We can have it inside init directly:

self._error_message = None

**kwargs
) -> None:
"""Initializes ADSWorkRequest object.
Expand Down Expand Up @@ -65,6 +66,7 @@ def __init__(
self._description = description
self._percentage = 0
self._status = None
_error_message = _error_message
super().__init__(config, signer, client_kwargs, **kwargs)


Expand All @@ -78,6 +80,7 @@ def _sync(self):
self._percentage= work_request.percent_complete
self._status = work_request.status
self._description = work_request_logs[-1].message if work_request_logs else "Processing"
if work_request.status == 'FAILED' : self._error_message = self.client.list_work_request_errors

def watch(
self,
Expand Down
4 changes: 2 additions & 2 deletions ads/model/service/oci_datascience_model_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,9 @@ def activate(
self.id,
)


self.workflow_req_id = response.headers.get("opc-work-request-id", None)
if wait_for_completion:
self.workflow_req_id = response.headers.get("opc-work-request-id", None)

try:
DataScienceWorkRequest(self.workflow_req_id).wait_work_request(
progress_bar_description="Activating model deployment",
Expand Down
2 changes: 1 addition & 1 deletion ads/telemetry/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,4 +119,4 @@ def record_event_async(
Thread
A started thread to send a head request to generate an event record.
"""
thread_pool.submit(self.record_event, args=(category, action, detail), kwargs=kwargs)
thread_pool.submit(self.record_event, category, action, detail, **kwargs)
Loading