@@ -178,17 +178,18 @@ export function calculateRanks(metrics: ModelMetrics[]): ModelMetrics[] {
178
178
/**
179
179
* Calculates a comprehensive exactness score for a model based on validation results.
180
180
*
181
- * The score combines:
182
- * 1. Match rate (what percentage of queries produced valid results)
183
- * 2. Quality of matches (how accurate the results are)
184
- * 3. Penalty for failures (reduces score for queries that failed)
181
+ * This function calculates the average of individual exactness scores across all questions,
182
+ * which provides a more accurate representation than using average distance metrics.
185
183
*
186
- * All calculations are done on a 0-1 scale internally for consistency,
187
- * then converted to 0-100 scale for final output.
184
+ * The score properly accounts for:
185
+ * 1. Individual question scores (including perfect 100 scores)
186
+ * 2. Failed queries (scored as 0)
187
+ * 3. Exact match bonus (exact-match rate times at most 5 points, limited to the headroom below 100)
188
188
*/
189
189
function blendedExactnessScore ( provider : string , model : string ) {
190
190
const modelKey = `${ provider } /${ model } ` ;
191
191
192
+ // Validate that model stats exist
192
193
if (
193
194
! validationSummaries . modelStats [
194
195
modelKey as keyof typeof validationSummaries . modelStats
@@ -198,35 +199,61 @@ function blendedExactnessScore(provider: string, model: string) {
198
199
return 0 ;
199
200
}
200
201
201
- const { totalMatches, exactMatches, avgExactDistance, avgNumericDistance, avgFScore } =
202
- validationSummaries . modelStats [
203
- modelKey as keyof typeof validationSummaries . modelStats
204
- ] ;
202
+ const modelStats = validationSummaries . modelStats [
203
+ modelKey as keyof typeof validationSummaries . modelStats
204
+ ] ;
205
205
206
- // Calculate match rates (0-1 scale)
207
- const totalMatchRate = totalMatches / validationSummaries . totalQuestions ;
208
- const exactMatchRate = exactMatches / validationSummaries . totalQuestions ;
209
- const failedMatchRate = 1 - totalMatchRate ;
206
+ // Validate that the required fields are numbers and that totalQuestions is non-zero
207
+ if (
208
+ typeof modelStats . totalMatches !== 'number' ||
209
+ typeof modelStats . exactMatches !== 'number' ||
210
+ typeof validationSummaries . totalQuestions !== 'number' ||
211
+ validationSummaries . totalQuestions === 0
212
+ ) {
213
+ console . log ( `Invalid validation data for ${ modelKey } ` ) ;
214
+ return 0 ;
215
+ }
216
+
217
+ const { totalMatches, exactMatches } = modelStats ;
210
218
211
- // Calculate quality score for successful matches (0-1 scale)
212
- // This already accounts for exact matches through the distance metrics
213
- const qualityScore = blendScore ( avgExactDistance , avgNumericDistance , avgFScore ) / 100 ;
219
+ // Calculate individual exactness scores for all questions
220
+ const individualScores : number [ ] = [ ] ;
214
221
215
- // Calculate comprehensive exactness score
216
- // Base score: successful matches weighted by their quality
217
- const baseScore = totalMatchRate * qualityScore ;
222
+ // Get all question keys from validation results
223
+ const questionKeys = Object . keys ( validationResults ) . filter ( key => key !== '_summary' ) ;
218
224
219
- // Apply penalty for failed matches (reduces score proportionally)
220
- // Using a moderate penalty to avoid overly harsh scoring
221
- const failurePenalty = failedMatchRate * 0.3 ;
225
+ // Validate we have questions to process
226
+ if ( questionKeys . length === 0 ) {
227
+ console . log ( `No questions found in validation results for ${ modelKey } ` ) ;
228
+ return 0 ;
229
+ }
222
230
223
- // Apply bonus for exact matches (additional reward for perfect accuracy)
224
- const exactMatchBonus = exactMatchRate * 0.1 ; // 10% bonus for exact matches
231
+ for ( const question of questionKeys ) {
232
+ const individualScore = getExactnessScore ( provider , model , question ) ;
233
+ individualScores . push ( individualScore ) ;
234
+ }
235
+
236
+ // Calculate average of individual scores (safe division)
237
+ const avgIndividualScore = individualScores . length > 0
238
+ ? individualScores . reduce ( ( sum , score ) => sum + score , 0 ) / individualScores . length
239
+ : 0 ;
240
+
241
+ // Compute the exact match rate used to scale the bonus (totalQuestions is checked to be non-zero above)
242
+ const exactMatchRate = exactMatches / validationSummaries . totalQuestions ;
225
243
226
- const comprehensiveScore = Math . max ( 0 , baseScore - failurePenalty + exactMatchBonus ) ;
244
+ // Calculate bonus that ensures final score never exceeds 100
245
+ const maxPossibleBonus = Math . max ( 0 , 100 - avgIndividualScore ) ;
246
+ const exactMatchBonus = exactMatchRate * Math . min ( 5 , maxPossibleBonus ) ;
247
+
248
+ const finalScore = avgIndividualScore + exactMatchBonus ;
249
+
250
+ // Validate final score is a valid number
251
+ if ( ! isFinite ( finalScore ) ) {
252
+ console . log ( `Invalid final score calculated for ${ modelKey } : ${ finalScore } ` ) ;
253
+ return 0 ;
254
+ }
227
255
228
- // Convert back to 0-100 scale for consistency with other scores
229
- return Math . round ( comprehensiveScore * 100 ) ;
256
+ return Math . round ( finalScore ) ;
230
257
}
231
258
232
259
/**
0 commit comments