
Commit 4f0349e

Enhance the validation of evaluate API input parameters (Azure#37965)

* initial changes
* update
* try to fix the changelog
* revert the changelog change
* fix the test
* address pylint errors
* update changelog
* update

1 parent 19688f9 commit 4f0349e

File tree

8 files changed, +105 -67 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

+2

@@ -6,11 +6,13 @@
 ### Features Added
 
 ### Breaking Changes
+- Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
 
 ### Bugs Fixed
 - Non adversarial simulator works with `gpt-4o` models using the `json_schema` response format
 
 ### Other Changes
+- Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
 
 ## 1.0.0b4 (2024-10-16)
 

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

+11 -15

@@ -10,7 +10,7 @@
 from typing_extensions import NotRequired, Required, TypeGuard
 
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, EvaluationException
 from azure.ai.evaluation._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -122,38 +122,34 @@ def validate_azure_ai_project(o: object) -> AzureAIProject:
     fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
 
     if not isinstance(o, dict):
-        msg = "azure_ai_project must be a dictionary"
+        msg = "The 'azure_ai_project' parameter must be a dictionary."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
 
     missing_fields = set(fields.keys()) - o.keys()
 
     if missing_fields:
-        msg = "azure_ai_project must contain keys: " + ", ".join(f'"{field}"' for field in missing_fields)
+        msg = (
+            "The 'azure_ai_project' dictionary is missing the following required "
+            f"field(s): {', '.join(f'{field}' for field in missing_fields)}."
+        )
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
 
     for field_name, expected_type in fields.items():
         if isinstance(o[field_name], expected_type):
            continue
 
-        msg = f"Expected azure_ai_project field {field_name!r} to be of type {expected_type}."
-
+        msg = f"Invalid type for field '{field_name}'. Expected {expected_type}, but got {type(o[field_name])}."
        raise EvaluationException(
-            message=f"{msg}. Got {type(o[field_name])}.",
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
 

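As a quick illustration of the tightened helper above (a minimal sketch, not part of the commit; it assumes only the private validate_azure_ai_project shown in this diff), a dictionary missing a required key now yields the more descriptive message:

from azure.ai.evaluation._common.utils import validate_azure_ai_project
from azure.ai.evaluation._exceptions import EvaluationException

try:
    # "project_name" is deliberately omitted to trigger the new missing-field message.
    validate_azure_ai_project({"subscription_id": "sub-id", "resource_group_name": "rg"})
except EvaluationException as err:
    # Expected to read roughly:
    # "The 'azure_ai_project' dictionary is missing the following required field(s): project_name."
    print(err)
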
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

+2 -2

@@ -37,7 +37,7 @@ def run(
         **kwargs
     ) -> ProxyRun:
         flow_to_run = flow
-        if hasattr(flow, "_to_async"):
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
             flow_to_run = flow._to_async()  # pylint: disable=protected-access
 
         batch_use_async = self._should_batch_use_async(flow_to_run)
@@ -77,7 +77,7 @@ def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
 
     @staticmethod
     def _should_batch_use_async(flow):
-        if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                 return True
             if inspect.iscoroutinefunction(flow):

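For context (an illustrative sketch, not part of the diff): since both run() and _should_batch_use_async now read the renamed variable, a caller can opt out of the async batch path before invoking the evaluate API:

import os

# Default is "true"; setting "false" skips the flow._to_async() conversion and the async batch run.
os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "false"
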
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

+46 -37

@@ -15,6 +15,7 @@
 
 from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
@@ -292,76 +293,85 @@ def _validate_columns_for_evaluators(
 
 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter must be provided for evaluation."
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
 
     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
-    if data is not None:
-        if not isinstance(data, str):
-            msg = "data parameter must be a string."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
 
-    if evaluators is not None:
-        if not isinstance(evaluators, dict):
-            msg = "evaluators parameter must be a dictionary."
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
-    if output_path is not None:
-        if not isinstance(output_path, str):
-            msg = "output_path parameter must be a string."
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
     if azure_ai_project is not None:
-        if not isinstance(azure_ai_project, Dict):
-            msg = "azure_ai_project parameter must be a dictionary."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+        validate_azure_ai_project(azure_ai_project)
 
     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
            raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
@@ -371,8 +381,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
blame=ErrorBlame.USER_ERROR,
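A minimal sketch of how the new up-front checks surface to callers (illustrative only; the file path and empty evaluators mapping are placeholders, and EvaluationException is imported from the same private module the unit tests use):

from azure.ai.evaluation import evaluate
from azure.ai.evaluation._exceptions import EvaluationException

try:
    evaluate(
        data="./missing_file.jsonl",  # hypothetical path that does not exist
        evaluators={},                # an empty mapping is now rejected as well
    )
except EvaluationException as err:
    # The data checks run first, so this prints something like:
    # "The input data file path './missing_file.jsonl' does not exist."
    print(err)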

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

+4 -4

@@ -219,9 +219,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
         singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
+            msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
             raise EvaluationException(
-                message="Invalid input",
-                internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                message=msg,
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.CONVERSATION,
@@ -233,9 +233,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
         if all(value is not None for value in singletons.values()):
             return [singletons]  # TODO loosen requirements to allow for optional singletons?
         # Missing input
+        msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
-            message="Missing input",
-            internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+            message=msg,
             blame=ErrorBlame.USER_ERROR,
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,

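To see the clearer evaluator-side messages in practice, here is a minimal sketch (the endpoint, key, and deployment values are placeholders; actually calling the evaluator would invoke the Azure OpenAI service):

from azure.ai.evaluation import FluencyEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "<your-deployment>",
}
fluency = FluencyEvaluator(model_config)

# Either pass the individual inputs...
fluency(query="What is the capital of Japan?", response="The capital of Japan is Tokyo.")

# ...or a full conversation, but not both. Supplying both now raises an EvaluationException whose
# message reads "FluencyEvaluator: Cannot provide both 'conversation' and individual inputs at the same time."
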
sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

+2 -2

@@ -207,7 +207,7 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file):
     @pytest.mark.performance_test
     @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr")
     def test_evaluate_with_async_enabled_evaluator(self, model_config, data_file):
-        os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "true"
+        os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "true"
         fluency_eval = FluencyEvaluator(model_config)
 
         start_time = time.time()
@@ -231,7 +231,7 @@ def test_evaluate_with_async_enabled_evaluator(self, model_config, data_file):
         assert "outputs.fluency.gpt_fluency" in row_result_df.columns.to_list()
         assert "fluency.gpt_fluency" in metrics.keys()
         assert duration < 10, f"evaluate API call took too long: {duration} seconds"
-        os.environ.pop("PF_EVALS_BATCH_USE_ASYNC")
+        os.environ.pop("AI_EVALS_BATCH_USE_ASYNC")
 
     @pytest.mark.parametrize(
         "use_pf_client,function,column",

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py

+3 -1

@@ -38,4 +38,6 @@ def test_fluency_evaluator_empty_string(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
             fluency_eval(query="What is the capital of Japan?", response=None)
 
-        assert "Missing input" in exc_info.value.args[0]
+        assert (
+            "FluencyEvaluator: Either 'conversation' or individual inputs must be provided." in exc_info.value.args[0]
+        )

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

+35 -6

@@ -111,14 +111,14 @@ def _question_answer_override_target(query, response):
 @pytest.mark.usefixtures("mock_model_config")
 @pytest.mark.unittest
 class TestEvaluate:
-    def test_evaluate_evaluators_not_a_dict(self, mock_model_config):
+    def test_evaluate_evaluators_not_a_dict(self, mock_model_config, questions_file):
         with pytest.raises(EvaluationException) as exc_info:
             evaluate(
-                data="data",
+                data=questions_file,
                 evaluators=[GroundednessEvaluator(model_config=mock_model_config)],
             )
 
-        assert "evaluators parameter must be a dictionary." in exc_info.value.args[0]
+        assert "The 'evaluators' parameter must be a dictionary." in exc_info.value.args[0]
 
     def test_evaluate_invalid_data(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
@@ -127,7 +127,26 @@ def test_evaluate_invalid_data(self, mock_model_config):
                 evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
             )
 
-        assert "data parameter must be a string." in exc_info.value.args[0]
+        assert "The 'data' parameter must be a string or a path-like object." in exc_info.value.args[0]
+
+    def test_evaluate_data_not_exist(self, mock_model_config):
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data="not_exist.jsonl",
+                evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
+            )
+
+        assert "The input data file path 'not_exist.jsonl' does not exist." in exc_info.value.args[0]
+
+    def test_target_not_callable(self, mock_model_config, questions_file):
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
+                target="not_callable",
+            )
+
+        assert "The 'target' parameter must be a callable function." in exc_info.value.args[0]
 
     def test_evaluate_invalid_jsonl_data(self, mock_model_config, invalid_jsonl_file):
         with pytest.raises(EvaluationException) as exc_info:
@@ -136,8 +155,8 @@ def test_evaluate_invalid_jsonl_data(self, mock_model_config, invalid_jsonl_file
                 evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
             )
 
-        assert "Failed to load data from " in exc_info.value.args[0]
-        assert "Confirm that it is valid jsonl data." in exc_info.value.args[0]
+        assert "Unable to load data from " in exc_info.value.args[0]
+        assert "Please ensure the input is valid JSONL format." in exc_info.value.args[0]
 
     def test_evaluate_missing_required_inputs(self, missing_columns_jsonl_file):
         with pytest.raises(EvaluationException) as exc_info:
@@ -367,6 +386,16 @@ def test_renaming_column(self):
         df_actuals = _rename_columns_conditionally(df)
         assert_frame_equal(df_actuals.sort_index(axis=1), df_expected.sort_index(axis=1))
 
+    def test_evaluate_output_dir_not_exist(self, mock_model_config, questions_file):
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
+                output_path="./not_exist_dir/output.jsonl",
+            )
+
+        assert "The output directory './not_exist_dir' does not exist." in exc_info.value.args[0]
+
     @pytest.mark.parametrize("use_pf_client", [True, False])
     def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_pf_client):
         output_path = os.path.join(tmpdir, "eval_test_results.jsonl")

0 commit comments
