@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import typing_extensions
-from typing import Dict, List, Optional
+from typing import Dict, List, Iterable, Optional
 from typing_extensions import Literal
 
 import httpx
@@ -436,6 +436,7 @@ def validate(
         custom_eval_thresholds: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
         custom_metadata: Optional[object] | NotGiven = NOT_GIVEN,
         eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
+        messages: Optional[Iterable[project_validate_params.Message]] | NotGiven = NOT_GIVEN,
         options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN,
         quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN,
         task: Optional[str] | NotGiven = NOT_GIVEN,
@@ -465,6 +466,10 @@ def validate(
           eval_scores: Scores assessing different aspects of the RAG system. If not provided, TLM will
               be used to generate scores.
 
+          messages: Optional message history to provide conversation context for the query. Used to
+              rewrite the query into a self-contained version of itself. If not provided, the
+              query will be treated as self-contained.
+
           options: Typed dict of advanced configuration options for the Trustworthy Language Model.
               Many of these configurations are determined by the quality preset selected
               (learn about quality presets in the TLM [initialization method](./#class-tlm)).
@@ -490,27 +495,24 @@ def validate(
               `use_self_reflection` = True.
               - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
               `use_self_reflection` = False. When using `get_trustworthiness_score()` on
-              "base" preset, a cheaper self-reflection will be used to compute the
-              trustworthiness score.
-
-              By default, the TLM uses the "medium" quality preset. The default base LLM
-              `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
-              You can set custom values for these arguments regardless of the quality preset
-              specified.
-
-              Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
-              "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
-              "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
-              "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
-              "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
-              "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
-              faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
-              "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
-              "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
-              "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
-              for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
-              "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
-              "gpt-4.1-nano", "nova-micro".
+              "base" preset, a faster self-reflection is employed.
+
+              By default, TLM uses the "medium" `quality_preset`, the "gpt-4.1-mini" base
+              `model`, and `max_tokens` set to 512. You can set custom values for these
+              arguments regardless of the quality preset specified.
+
+              Args: model ({"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3",
+              "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4",
+              "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0",
+              "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet",
+              "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"},
+              default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield
+              better results, faster models yield faster results). - Models still in beta:
+              "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview",
+              "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet",
+              "claude-3.5-haiku". - Recommended models for accuracy: "gpt-4.1", "o4-mini",
+              "o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models for low
+              latency/costs: "gpt-4.1-nano", "nova-micro".
 
               max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
               Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
@@ -536,7 +538,7 @@ def validate(
 
               similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
               trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
-              Supported similarity measures include: "semantic" (based on natural language inference),
+              Supported similarity measures include - "semantic" (based on natural language inference),
               "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
               "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
               and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
@@ -588,6 +590,7 @@ def validate(
                     "custom_eval_thresholds": custom_eval_thresholds,
                     "custom_metadata": custom_metadata,
                     "eval_scores": eval_scores,
+                    "messages": messages,
                     "options": options,
                     "quality_preset": quality_preset,
                     "task": task,
@@ -985,6 +988,7 @@ async def validate(
         custom_eval_thresholds: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
         custom_metadata: Optional[object] | NotGiven = NOT_GIVEN,
         eval_scores: Optional[Dict[str, float]] | NotGiven = NOT_GIVEN,
+        messages: Optional[Iterable[project_validate_params.Message]] | NotGiven = NOT_GIVEN,
         options: Optional[project_validate_params.Options] | NotGiven = NOT_GIVEN,
         quality_preset: Literal["best", "high", "medium", "low", "base"] | NotGiven = NOT_GIVEN,
         task: Optional[str] | NotGiven = NOT_GIVEN,
@@ -1014,6 +1018,10 @@ async def validate(
           eval_scores: Scores assessing different aspects of the RAG system. If not provided, TLM will
               be used to generate scores.
 
+          messages: Optional message history to provide conversation context for the query. Used to
+              rewrite the query into a self-contained version of itself. If not provided, the
+              query will be treated as self-contained.
+
           options: Typed dict of advanced configuration options for the Trustworthy Language Model.
               Many of these configurations are determined by the quality preset selected
               (learn about quality presets in the TLM [initialization method](./#class-tlm)).
@@ -1039,27 +1047,24 @@ async def validate(
               `use_self_reflection` = True.
               - **base:** `num_candidate_responses` = 1, `num_consistency_samples` = 0,
               `use_self_reflection` = False. When using `get_trustworthiness_score()` on
-              "base" preset, a cheaper self-reflection will be used to compute the
-              trustworthiness score.
-
-              By default, the TLM uses the "medium" quality preset. The default base LLM
-              `model` used is "gpt-4o-mini", and `max_tokens` is 512 for all quality presets.
-              You can set custom values for these arguments regardless of the quality preset
-              specified.
-
-              Args: model ({"gpt-4o-mini", "gpt-4o", "gpt-4.1", "gpt-4.1-mini",
-              "gpt-4.1-nano", "o4-mini", "o3", "o3-mini", "o1", "o1-mini", "gpt-4",
-              "gpt-4.5-preview", "gpt-3.5-turbo-16k", "claude-3.7-sonnet",
-              "claude-3.5-sonnet-v2", "claude-3.5-sonnet", "claude-3.5-haiku",
-              "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"}, default =
-              "gpt-4o-mini"): Underlying base LLM to use (better models yield better results,
-              faster models yield faster/cheaper results). - Models still in beta: "o3", "o1",
-              "o4-mini", "o3-mini", "o1-mini", "gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano",
-              "gpt-4.5-preview", "claude-3.7-sonnet", "claude-3.5-sonnet-v2",
-              "claude-3.5-haiku", "nova-micro", "nova-lite", "nova-pro". - Recommended models
-              for accuracy: "gpt-4.1", "o4-mini", "o3", "claude-3.7-sonnet",
-              "claude-3.5-sonnet-v2". - Recommended models for low latency/costs:
-              "gpt-4.1-nano", "nova-micro".
+              "base" preset, a faster self-reflection is employed.
+
+              By default, TLM uses the "medium" `quality_preset`, the "gpt-4.1-mini" base
+              `model`, and `max_tokens` set to 512. You can set custom values for these
+              arguments regardless of the quality preset specified.
+
+              Args: model ({"gpt-4.1", "gpt-4.1-mini", "gpt-4.1-nano", "o4-mini", "o3",
+              "gpt-4.5-preview", "gpt-4o-mini", "gpt-4o", "o3-mini", "o1", "o1-mini", "gpt-4",
+              "gpt-3.5-turbo-16k", "claude-opus-4-0", "claude-sonnet-4-0",
+              "claude-3.7-sonnet", "claude-3.5-sonnet-v2", "claude-3.5-sonnet",
+              "claude-3.5-haiku", "claude-3-haiku", "nova-micro", "nova-lite", "nova-pro"},
+              default = "gpt-4.1-mini"): Underlying base LLM to use (better models yield
+              better results, faster models yield faster results). - Models still in beta:
+              "o3", "o1", "o4-mini", "o3-mini", "o1-mini", "gpt-4.5-preview",
+              "claude-opus-4-0", "claude-sonnet-4-0", "claude-3.7-sonnet",
+              "claude-3.5-haiku". - Recommended models for accuracy: "gpt-4.1", "o4-mini",
+              "o3", "claude-opus-4-0", "claude-sonnet-4-0". - Recommended models for low
+              latency/costs: "gpt-4.1-nano", "nova-micro".
 
               max_tokens (int, default = 512): the maximum number of tokens that can be generated in the TLM response (and in internal trustworthiness scoring).
               Higher values here may produce better (more reliable) TLM responses and trustworthiness scores, but at higher runtimes/costs.
@@ -1085,7 +1090,7 @@ async def validate(
 
               similarity_measure ({"semantic", "string", "embedding", "embedding_large", "code", "discrepancy"}, default = "semantic"): how the
               trustworthiness scoring's consistency algorithm measures similarity between alternative responses considered plausible by the model.
-              Supported similarity measures include: "semantic" (based on natural language inference),
+              Supported similarity measures include - "semantic" (based on natural language inference),
               "embedding" (based on vector embedding similarity), "embedding_large" (based on a larger embedding model),
               "code" (based on model-based analysis designed to compare code), "discrepancy" (based on model-based analysis of possible discrepancies),
               and "string" (based on character/word overlap). Set this to "string" for minimal runtimes/costs.
@@ -1137,6 +1142,7 @@ async def validate(
                     "custom_eval_thresholds": custom_eval_thresholds,
                     "custom_metadata": custom_metadata,
                     "eval_scores": eval_scores,
+                    "messages": messages,
                     "options": options,
                     "quality_preset": quality_preset,
                     "task": task,