
Commit 4f0349e

Enhance the validation of evaluate API input parameters (Azure#37965)

* initial changes
* update
* try to fix the changelog
* revert the changelog change
* fix the test
* address pylint errors
* update changelog
* update

1 parent 19688f9 commit 4f0349e

File tree

8 files changed, +105 -67 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

+2

@@ -6,11 +6,13 @@
 ### Features Added
 
 ### Breaking Changes
+- Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`.
 
 ### Bugs Fixed
 - Non adversarial simulator works with `gpt-4o` models using the `json_schema` response format
 
 ### Other Changes
+- Improved error messages for the `evaluate` API by enhancing the validation of input parameters. This update provides more detailed and actionable error descriptions.
 
 ## 1.0.0b4 (2024-10-16)
 

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py

+11 -15

@@ -10,7 +10,7 @@
 from typing_extensions import NotRequired, Required, TypeGuard
 
 from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE
-from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, EvaluationException
 from azure.ai.evaluation._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -122,38 +122,34 @@ def validate_azure_ai_project(o: object) -> AzureAIProject:
     fields = {"subscription_id": str, "resource_group_name": str, "project_name": str}
 
     if not isinstance(o, dict):
-        msg = "azure_ai_project must be a dictionary"
+        msg = "The 'azure_ai_project' parameter must be a dictionary."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
 
     missing_fields = set(fields.keys()) - o.keys()
 
     if missing_fields:
-        msg = "azure_ai_project must contain keys: " + ", ".join(f'"{field}"' for field in missing_fields)
+        msg = (
+            "The 'azure_ai_project' dictionary is missing the following required "
+            f"field(s): {', '.join(f'{field}' for field in missing_fields)}."
+        )
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
 
     for field_name, expected_type in fields.items():
         if isinstance(o[field_name], expected_type):
            continue
 
-        msg = f"Expected azure_ai_project field {field_name!r} to be of type {expected_type}."
-
+        msg = f"Invalid type for field '{field_name}'. Expected {expected_type}, but got {type(o[field_name])}."
        raise EvaluationException(
-            message=f"{msg}. Got {type(o[field_name])}.",
-            internal_message=msg,
-            target=ErrorTarget.DIRECT_ATTACK_SIMULATOR,
-            category=ErrorCategory.MISSING_FIELD,
+            message=msg,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
 

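As a quick illustration of the tightened helper above (a minimal sketch, not part of the commit; it assumes only the private validate_azure_ai_project shown in this diff), a dictionary missing a required key now yields the more descriptive message:

from azure.ai.evaluation._common.utils import validate_azure_ai_project
from azure.ai.evaluation._exceptions import EvaluationException

try:
    # "project_name" is deliberately omitted to trigger the new missing-field message.
    validate_azure_ai_project({"subscription_id": "sub-id", "resource_group_name": "rg"})
except EvaluationException as err:
    # Expected to read roughly:
    # "The 'azure_ai_project' dictionary is missing the following required field(s): project_name."
    print(err)
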
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run_client/proxy_client.py

+2 -2

@@ -37,7 +37,7 @@ def run(
         **kwargs
     ) -> ProxyRun:
         flow_to_run = flow
-        if hasattr(flow, "_to_async"):
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true" and hasattr(flow, "_to_async"):
             flow_to_run = flow._to_async()  # pylint: disable=protected-access
 
         batch_use_async = self._should_batch_use_async(flow_to_run)
@@ -77,7 +77,7 @@ def get_run_summary(self, proxy_run: ProxyRun) -> Dict[str, Any]:
 
     @staticmethod
     def _should_batch_use_async(flow):
-        if os.getenv("PF_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
+        if os.getenv("AI_EVALS_BATCH_USE_ASYNC", "true").lower() == "true":
             if hasattr(flow, "__call__") and inspect.iscoroutinefunction(flow.__call__):
                 return True
             if inspect.iscoroutinefunction(flow):

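For context (an illustrative sketch, not part of the diff): since both run() and _should_batch_use_async now read the renamed variable, a caller can opt out of the async batch path before invoking the evaluate API:

import os

# Default is "true"; setting "false" skips the flow._to_async() conversion and the async batch run.
os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "false"
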
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

+46 -37

@@ -15,6 +15,7 @@
 
 from azure.ai.evaluation._common.math import list_sum
 from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException
+from azure.ai.evaluation._common.utils import validate_azure_ai_project
 
 from .._constants import (
     CONTENT_SAFETY_DEFECT_RATE_THRESHOLD_DEFAULT,
@@ -292,76 +293,85 @@ def _validate_columns_for_evaluators(
 
 def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_project, evaluation_name):
     if data is None:
-        msg = "data parameter must be provided for evaluation."
+        msg = "The 'data' parameter is required for evaluation."
         raise EvaluationException(
             message=msg,
-            internal_message=msg,
             target=ErrorTarget.EVALUATE,
-            category=ErrorCategory.MISSING_FIELD,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(data, (os.PathLike, str)):
+        msg = "The 'data' parameter must be a string or a path-like object."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not os.path.exists(data):
+        msg = f"The input data file path '{data}' does not exist."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
         )
 
     if target is not None:
         if not callable(target):
-            msg = "target parameter must be a callable function."
+            msg = "The 'target' parameter must be a callable function."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
-    if data is not None:
-        if not isinstance(data, str):
-            msg = "data parameter must be a string."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+    if not evaluators:
+        msg = "The 'evaluators' parameter is required and cannot be None or empty."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
+    if not isinstance(evaluators, dict):
+        msg = "The 'evaluators' parameter must be a dictionary."
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.INVALID_VALUE,
+            blame=ErrorBlame.USER_ERROR,
+        )
 
-    if evaluators is not None:
-        if not isinstance(evaluators, dict):
-            msg = "evaluators parameter must be a dictionary."
+    if output_path is not None:
+        if not isinstance(output_path, (os.PathLike, str)):
+            msg = "The 'output_path' parameter must be a string or a path-like object."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
-    if output_path is not None:
-        if not isinstance(output_path, str):
-            msg = "output_path parameter must be a string."
+        output_dir = output_path if os.path.isdir(output_path) else os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            msg = f"The output directory '{output_dir}' does not exist. Please create the directory manually."
             raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
             )
 
     if azure_ai_project is not None:
-        if not isinstance(azure_ai_project, Dict):
-            msg = "azure_ai_project parameter must be a dictionary."
-            raise EvaluationException(
-                message=msg,
-                internal_message=msg,
-                target=ErrorTarget.EVALUATE,
-                category=ErrorCategory.INVALID_VALUE,
-                blame=ErrorBlame.USER_ERROR,
-            )
+        validate_azure_ai_project(azure_ai_project)
 
     if evaluation_name is not None:
-        if not isinstance(evaluation_name, str):
-            msg = "evaluation_name parameter must be a string."
+        if not isinstance(evaluation_name, str) or not evaluation_name.strip():
+            msg = "The 'evaluation_name' parameter must be a non-empty string."
            raise EvaluationException(
                 message=msg,
-                internal_message=msg,
                 target=ErrorTarget.EVALUATE,
                 category=ErrorCategory.INVALID_VALUE,
                 blame=ErrorBlame.USER_ERROR,
@@ -371,8 +381,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
         initial_data_df = pd.read_json(data, lines=True)
     except Exception as e:
         raise EvaluationException(
-            message=f"Failed to load data from {data}. Confirm that it is valid jsonl data. Error: {str(e)}.",
-            internal_message="Failed to load data. Confirm that it is valid jsonl data.",
+            message=f"Unable to load data from '{data}'. Please ensure the input is valid JSONL format. Detailed error: {e}.",
             target=ErrorTarget.EVALUATE,
             category=ErrorCategory.INVALID_VALUE,
             blame=ErrorBlame.USER_ERROR,
blame=ErrorBlame.USER_ERROR,
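A minimal sketch of how the new up-front checks surface to callers (illustrative only; the file path and empty evaluators mapping are placeholders, and EvaluationException is imported from the same private module the unit tests use):

from azure.ai.evaluation import evaluate
from azure.ai.evaluation._exceptions import EvaluationException

try:
    evaluate(
        data="./missing_file.jsonl",  # hypothetical path that does not exist
        evaluators={},                # an empty mapping is now rejected as well
    )
except EvaluationException as err:
    # The data checks run first, so this prints something like:
    # "The input data file path './missing_file.jsonl' does not exist."
    print(err)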

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

+4 -4

@@ -219,9 +219,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
         singletons = {key: kwargs.get(key, None) for key in self._singleton_inputs}
         # Check that both conversation and other inputs aren't set
         if conversation is not None and any(singletons.values()):
+            msg = f"{type(self).__name__}: Cannot provide both 'conversation' and individual inputs at the same time."
             raise EvaluationException(
-                message="Invalid input",
-                internal_message=f"Both conversation and individual inputs were provided to {type(self).__name__}",
+                message=msg,
                 blame=ErrorBlame.USER_ERROR,
                 category=ErrorCategory.INVALID_VALUE,
                 target=ErrorTarget.CONVERSATION,
@@ -233,9 +233,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs) -> Union[List[Dict], List[Deri
         if all(value is not None for value in singletons.values()):
             return [singletons]  # TODO loosen requirements to allow for optional singletons?
         # Missing input
+        msg = f"{type(self).__name__}: Either 'conversation' or individual inputs must be provided."
         raise EvaluationException(
-            message="Missing input",
-            internal_message=f"Neither conversation nor individual inputs provided to {type(self).__name__}.",
+            message=msg,
             blame=ErrorBlame.USER_ERROR,
             category=ErrorCategory.INVALID_VALUE,
             target=ErrorTarget.CONVERSATION,

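To see the clearer evaluator-side messages in practice, here is a minimal sketch (the endpoint, key, and deployment values are placeholders; actually calling the evaluator would invoke the Azure OpenAI service):

from azure.ai.evaluation import FluencyEvaluator

model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<your-api-key>",
    "azure_deployment": "<your-deployment>",
}
fluency = FluencyEvaluator(model_config)

# Either pass the individual inputs...
fluency(query="What is the capital of Japan?", response="The capital of Japan is Tokyo.")

# ...or a full conversation, but not both. Supplying both now raises an EvaluationException whose
# message reads "FluencyEvaluator: Cannot provide both 'conversation' and individual inputs at the same time."
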
sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

+2 -2

@@ -207,7 +207,7 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file):
     @pytest.mark.performance_test
     @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr")
     def test_evaluate_with_async_enabled_evaluator(self, model_config, data_file):
-        os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "true"
+        os.environ["AI_EVALS_BATCH_USE_ASYNC"] = "true"
         fluency_eval = FluencyEvaluator(model_config)
 
         start_time = time.time()
@@ -231,7 +231,7 @@ def test_evaluate_with_async_enabled_evaluator(self, model_config, data_file):
         assert "outputs.fluency.gpt_fluency" in row_result_df.columns.to_list()
         assert "fluency.gpt_fluency" in metrics.keys()
         assert duration < 10, f"evaluate API call took too long: {duration} seconds"
-        os.environ.pop("PF_EVALS_BATCH_USE_ASYNC")
+        os.environ.pop("AI_EVALS_BATCH_USE_ASYNC")
 
     @pytest.mark.parametrize(
         "use_pf_client,function,column",

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_built_in_evaluator.py

+3 -1

@@ -38,4 +38,6 @@ def test_fluency_evaluator_empty_string(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
             fluency_eval(query="What is the capital of Japan?", response=None)
 
-        assert "Missing input" in exc_info.value.args[0]
+        assert (
+            "FluencyEvaluator: Either 'conversation' or individual inputs must be provided." in exc_info.value.args[0]
+        )

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

+35 -6

@@ -111,14 +111,14 @@ def _question_answer_override_target(query, response):
 @pytest.mark.usefixtures("mock_model_config")
 @pytest.mark.unittest
 class TestEvaluate:
-    def test_evaluate_evaluators_not_a_dict(self, mock_model_config):
+    def test_evaluate_evaluators_not_a_dict(self, mock_model_config, questions_file):
         with pytest.raises(EvaluationException) as exc_info:
             evaluate(
-                data="data",
+                data=questions_file,
                 evaluators=[GroundednessEvaluator(model_config=mock_model_config)],
             )
 
-        assert "evaluators parameter must be a dictionary." in exc_info.value.args[0]
+        assert "The 'evaluators' parameter must be a dictionary." in exc_info.value.args[0]
 
     def test_evaluate_invalid_data(self, mock_model_config):
         with pytest.raises(EvaluationException) as exc_info:
@@ -127,7 +127,26 @@ def test_evaluate_invalid_data(self, mock_model_config):
                 evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
             )
 
-        assert "data parameter must be a string." in exc_info.value.args[0]
+        assert "The 'data' parameter must be a string or a path-like object." in exc_info.value.args[0]
+
+    def test_evaluate_data_not_exist(self, mock_model_config):
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data="not_exist.jsonl",
+                evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
+            )
+
+        assert "The input data file path 'not_exist.jsonl' does not exist." in exc_info.value.args[0]
+
+    def test_target_not_callable(self, mock_model_config, questions_file):
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
+                target="not_callable",
+            )
+
+        assert "The 'target' parameter must be a callable function." in exc_info.value.args[0]
 
     def test_evaluate_invalid_jsonl_data(self, mock_model_config, invalid_jsonl_file):
         with pytest.raises(EvaluationException) as exc_info:
@@ -136,8 +155,8 @@ def test_evaluate_invalid_jsonl_data(self, mock_model_config, invalid_jsonl_file
                 evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
             )
 
-        assert "Failed to load data from " in exc_info.value.args[0]
-        assert "Confirm that it is valid jsonl data." in exc_info.value.args[0]
+        assert "Unable to load data from " in exc_info.value.args[0]
+        assert "Please ensure the input is valid JSONL format." in exc_info.value.args[0]
 
     def test_evaluate_missing_required_inputs(self, missing_columns_jsonl_file):
         with pytest.raises(EvaluationException) as exc_info:
@@ -367,6 +386,16 @@ def test_renaming_column(self):
         df_actuals = _rename_columns_conditionally(df)
         assert_frame_equal(df_actuals.sort_index(axis=1), df_expected.sort_index(axis=1))
 
+    def test_evaluate_output_dir_not_exist(self, mock_model_config, questions_file):
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
+                output_path="./not_exist_dir/output.jsonl",
+            )
+
+        assert "The output directory './not_exist_dir' does not exist." in exc_info.value.args[0]
+
     @pytest.mark.parametrize("use_pf_client", [True, False])
     def test_evaluate_output_path(self, evaluate_test_data_jsonl_file, tmpdir, use_pf_client):
         output_path = os.path.join(tmpdir, "eval_test_results.jsonl")

0 commit comments
