Skip to content

Commit 1fcb0e5

Browse files
committed
feat: implement blended exactness score for model evaluation
1 parent 783f98f commit 1fcb0e5

File tree

1 file changed

+32
-10
lines changed

1 file changed

+32
-10
lines changed

src/src/lib/eval.ts

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,17 @@ export function calculateRanks(metrics: ModelMetrics[]): ModelMetrics[] {
175175
});
176176
}
177177

178+
/**
179+
* Calculates a comprehensive exactness score for a model based on validation results.
180+
*
181+
* The score combines:
182+
* 1. Match rate (what percentage of queries produced valid results)
183+
* 2. Quality of matches (how accurate the results are)
184+
* 3. Penalty for failures (reduces score for queries that failed) and a small bonus for exact matches
185+
*
186+
* All calculations are done on a 0-1 scale internally for consistency,
187+
* then converted to 0-100 scale for final output.
188+
*/
178189
function blendedExactnessScore(provider: string, model: string) {
179190
const modelKey = `${provider}/${model}`;
180191

@@ -192,29 +203,40 @@ function blendedExactnessScore(provider: string, model: string) {
192203
modelKey as keyof typeof validationSummaries.modelStats
193204
];
194205

195-
// Calculate match rates
206+
// Calculate match rates (0-1 scale)
196207
const totalMatchRate = totalMatches / validationSummaries.totalQuestions;
197208
const exactMatchRate = exactMatches / validationSummaries.totalQuestions;
198209
const failedMatchRate = 1 - totalMatchRate;
199210

200-
// Calculate quality score for successful matches
201-
const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore);
211+
// Calculate quality score for successful matches (0-1 scale)
212+
// This already accounts for exact matches through the distance metrics
213+
const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore) / 100;
202214

203-
// Calculate comprehensive exactness score with penalties for failures
204-
// Base score from successful matches
215+
// Calculate comprehensive exactness score
216+
// Base score: successful matches weighted by their quality
205217
const baseScore = totalMatchRate * qualityScore;
206218

207-
// Penalty for failed matches (each failed match reduces score)
208-
const failurePenalty = failedMatchRate * 1; // score here if needed
219+
// Apply penalty for failed matches (reduces score proportionally)
220+
// Using a moderate penalty to avoid overly harsh scoring
221+
const failurePenalty = failedMatchRate * 0.3;
209222

210-
// Bonus for exact matches
211-
const exactMatchBonus = exactMatchRate * 1; // score here if needed
223+
// Apply bonus for exact matches (additional reward for perfect accuracy)
224+
const exactMatchBonus = exactMatchRate * 0.1; // 10% bonus for exact matches
212225

213226
const comprehensiveScore = Math.max(0, baseScore - failurePenalty + exactMatchBonus);
214227

215-
return comprehensiveScore;
228+
// Convert back to 0-100 scale for consistency with other scores
229+
return Math.round(comprehensiveScore * 100);
216230
}
217231

232+
/**
233+
* Blends different distance metrics into a single quality score.
234+
*
235+
* @param exact - Exact distance metric (0 = perfect match, 1 = complete mismatch)
236+
* @param numeric - Numeric distance metric (0 = perfect match, 1 = complete mismatch)
237+
* @param fscore - F-score metric (0 = worst, 1 = best)
238+
* @returns Quality score on 0-100 scale (100 = perfect)
239+
*/
218240
function blendScore(exact: number, numeric: number, fscore: number) {
  // The two distance inputs are inverted (1 - value) before weighting;
  // fscore is weighted as-is.
  const exactTerm = 0.65 * (1 - exact);
  const numericTerm = 0.25 * (1 - numeric);
  const fscoreTerm = 0.1 * fscore;

  // Sum the weighted terms left to right, then scale the blend by 100.
  const blended = exactTerm + numericTerm + fscoreTerm;
  return 100 * blended;
}

0 commit comments

Comments
 (0)