Skip to content

Enhance the container family validation for multi-model deployment #1148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 7, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions ads/aqua/common/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# Copyright (c) 2024, 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from typing import Dict, List

from ads.common.extended_enum import ExtendedEnum


Expand Down Expand Up @@ -106,3 +108,15 @@ class ModelFormat(ExtendedEnum):
# Platform identifiers. Each member's string value is identical to its name.
class Platform(ExtendedEnum):
    ARM_CPU = "ARM_CPU"
    NVIDIA_GPU = "NVIDIA_GPU"


# This dictionary defines compatibility groups for container families.
# The structure is:
# - Key: The preferred container family to use when multiple compatible families are selected.
# - Value: A list of all compatible families (including the preferred one).
CONTAINER_FAMILY_COMPATIBILITY: Dict[str, List[str]] = {
Copy link
Member

@kumar-shivam-ranjan kumar-shivam-ranjan Apr 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a specific reason why we chose odsc-vllm-v1 as the key and not odsc-vllm?
If I understand correctly, if two or more models are chosen, with some compatible with odsc-vllm-v1 and others with odsc-vllm, the group will be deployed with odsc-vllm-v1. And if all selected models are compatible with odsc-vllm, do we still go ahead and deploy with odsc-vllm-v1?
Correct me if I am wrong. @mrDzurb

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see.
I believe odsc-vllm-v1 is preferred in both cases.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no perfect solution for this, in my opinion. Ideally, we would re-test all service models and update them to use the latest container, but that would be too time-consuming. For now, this is just a best-effort attempt to choose the most recent container family when models from different families are mixed. Hopefully, vLLM will continue to improve, and the enhancement introduced in this PR will become more robust.

InferenceContainerTypeFamily.AQUA_VLLM_V1_CONTAINER_FAMILY: [
InferenceContainerTypeFamily.AQUA_VLLM_V1_CONTAINER_FAMILY,
InferenceContainerTypeFamily.AQUA_VLLM_CONTAINER_FAMILY,
],
}
38 changes: 38 additions & 0 deletions ads/aqua/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

from ads.aqua.common.entities import GPUShapesIndex
from ads.aqua.common.enums import (
CONTAINER_FAMILY_COMPATIBILITY,
InferenceContainerParamType,
InferenceContainerType,
RqsAdditionalDetails,
Expand Down Expand Up @@ -1316,3 +1317,40 @@ def load_gpu_shapes_index(
)

return GPUShapesIndex(**data)


from typing import Optional  # noqa: E402 - required by the return annotation below


def get_preferred_compatible_family(selected_families: set[str]) -> Optional[str]:
    """
    Determines the preferred container family from a given set of container families.

    This method is used in the context of multi-model deployment to handle cases
    where models selected for deployment use different, but compatible, container families.

    It checks the input `selected_families` set against the `CONTAINER_FAMILY_COMPATIBILITY` map.
    If a compatibility group exists that fully includes all the families in the input,
    the corresponding key (i.e., the preferred family) is returned.

    Parameters
    ----------
    selected_families : set[str]
        A set of container family identifiers.

    Returns
    -------
    Optional[str]
        The preferred container family if all families are compatible within one group;
        otherwise, returns `None` indicating that no compatible family group was found.
        An empty input set also yields `None`, since there is nothing to match.

    Example
    -------
    >>> get_preferred_compatible_family({"odsc-vllm-serving", "odsc-vllm-serving-v1"})
    'odsc-vllm-serving-v1'

    >>> get_preferred_compatible_family({"odsc-vllm-serving", "odsc-tgi-serving"}) is None
    True
    """
    # An empty set is a subset of every group, so without this guard it would
    # vacuously match the first entry of the compatibility map.
    if not selected_families:
        return None

    for preferred, compatible_list in CONTAINER_FAMILY_COMPATIBILITY.items():
        # `issubset` accepts any iterable, so the list needs no conversion to a set.
        if selected_families.issubset(compatible_list):
            return preferred

    return None
23 changes: 17 additions & 6 deletions ads/aqua/model/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
get_artifact_path,
get_container_config,
get_hf_model_info,
get_preferred_compatible_family,
list_os_files_with_extension,
load_config,
read_file,
Expand Down Expand Up @@ -337,15 +338,25 @@ def create_multi(

selected_models_deployment_containers.add(deployment_container)

# Check if the all models in the group shares same container family
if len(selected_models_deployment_containers) > 1:
if not selected_models_deployment_containers:
raise AquaValueError(
"The selected models are associated with different container families: "
f"{list(selected_models_deployment_containers)}."
"For multi-model deployment, all models in the group must share the same container family."
"None of the selected models are associated with a recognized container family. "
"Please review the selected models, or select a different group of models."
)

deployment_container = selected_models_deployment_containers.pop()
# Check if the all models in the group shares same container family
if len(selected_models_deployment_containers) > 1:
deployment_container = get_preferred_compatible_family(
selected_families=selected_models_deployment_containers
)
if not deployment_container:
raise AquaValueError(
"The selected models are associated with different container families: "
f"{list(selected_models_deployment_containers)}."
"For multi-model deployment, all models in the group must share the same container family."
)
else:
deployment_container = selected_models_deployment_containers.pop()

# Generate model group details
timestamp = datetime.now().strftime("%Y%m%d")
Expand Down
24 changes: 24 additions & 0 deletions tests/unitary/with_extras/aqua/test_common_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import pytest
from ads.aqua.common.utils import get_preferred_compatible_family


class TestCommonUtils:
    """Unit tests for helper functions imported from ``ads.aqua.common.utils``."""

    # Each case pairs a set of selected container families with the family
    # expected back, or None when no single compatibility group covers them all.
    @pytest.mark.parametrize(
        ("input_families", "expected"),
        [
            ({"odsc-vllm-serving", "odsc-vllm-serving-v1"}, "odsc-vllm-serving-v1"),
            ({"odsc-tgi-serving", "odsc-vllm-serving"}, None),
            ({"non-existing-one", "odsc-tgi-serving"}, None),
        ],
    )
    def test_get_preferred_compatible_family(self, input_families, expected):
        """Check the family resolved for each combination of selected families."""
        preferred = get_preferred_compatible_family(input_families)
        assert preferred == expected