Skip to content

Commit 47c57b2

Browse files
committed
refactor: improve blended exactness score calculation for model evaluation
1 parent 1fcb0e5 commit 47c57b2

File tree

1 file changed

+55
-28
lines changed

1 file changed

+55
-28
lines changed

src/src/lib/eval.ts

Lines changed: 55 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -178,17 +178,18 @@ export function calculateRanks(metrics: ModelMetrics[]): ModelMetrics[] {
178178
/**
179179
* Calculates a comprehensive exactness score for a model based on validation results.
180180
*
181-
* The score combines:
182-
* 1. Match rate (what percentage of queries produced valid results)
183-
* 2. Quality of matches (how accurate the results are)
184-
* 3. Penalty for failures (reduces score for queries that failed)
181+
* This function calculates the average of individual exactness scores across all questions,
182+
* which provides a more accurate representation than using average distance metrics.
185183
*
186-
* All calculations are done on a 0-1 scale internally for consistency,
187-
* then converted to 0-100 scale for final output.
184+
* The score properly accounts for:
185+
* 1. Individual question scores (including perfect 100 scores)
186+
* 2. Failed queries (scored as 0)
187+
* 3. Exact match bonus for perfect accuracy
188188
*/
189189
function blendedExactnessScore(provider: string, model: string) {
190190
const modelKey = `${provider}/${model}`;
191191

192+
// Validate that model stats exist
192193
if (
193194
!validationSummaries.modelStats[
194195
modelKey as keyof typeof validationSummaries.modelStats
@@ -198,35 +199,61 @@ function blendedExactnessScore(provider: string, model: string) {
198199
return 0;
199200
}
200201

201-
const { totalMatches, exactMatches, avgExactDistance, avgNumericDistance, avgFScore } =
202-
validationSummaries.modelStats[
203-
modelKey as keyof typeof validationSummaries.modelStats
204-
];
202+
const modelStats = validationSummaries.modelStats[
203+
modelKey as keyof typeof validationSummaries.modelStats
204+
];
205205

206-
// Calculate match rates (0-1 scale)
207-
const totalMatchRate = totalMatches / validationSummaries.totalQuestions;
208-
const exactMatchRate = exactMatches / validationSummaries.totalQuestions;
209-
const failedMatchRate = 1 - totalMatchRate;
206+
// Validate required fields exist and are numbers
207+
if (
208+
typeof modelStats.totalMatches !== 'number' ||
209+
typeof modelStats.exactMatches !== 'number' ||
210+
typeof validationSummaries.totalQuestions !== 'number' ||
211+
validationSummaries.totalQuestions === 0
212+
) {
213+
console.log(`Invalid validation data for ${modelKey}`);
214+
return 0;
215+
}
216+
217+
const { totalMatches, exactMatches } = modelStats;
210218

211-
// Calculate quality score for successful matches (0-1 scale)
212-
// This already accounts for exact matches through the distance metrics
213-
const qualityScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore) / 100;
219+
// Calculate individual exactness scores for all questions
220+
const individualScores: number[] = [];
214221

215-
// Calculate comprehensive exactness score
216-
// Base score: successful matches weighted by their quality
217-
const baseScore = totalMatchRate * qualityScore;
222+
// Get all question keys from validation results
223+
const questionKeys = Object.keys(validationResults).filter(key => key !== '_summary');
218224

219-
// Apply penalty for failed matches (reduces score proportionally)
220-
// Using a moderate penalty to avoid overly harsh scoring
221-
const failurePenalty = failedMatchRate * 0.3;
225+
// Validate we have questions to process
226+
if (questionKeys.length === 0) {
227+
console.log(`No questions found in validation results for ${modelKey}`);
228+
return 0;
229+
}
222230

223-
// Apply bonus for exact matches (additional reward for perfect accuracy)
224-
const exactMatchBonus = exactMatchRate * 0.1; // 10% bonus for exact matches
231+
for (const question of questionKeys) {
232+
const individualScore = getExactnessScore(provider, model, question);
233+
individualScores.push(individualScore);
234+
}
235+
236+
// Calculate average of individual scores (safe division)
237+
const avgIndividualScore = individualScores.length > 0
238+
? individualScores.reduce((sum, score) => sum + score, 0) / individualScores.length
239+
: 0;
240+
241+
// Apply exact match bonus (safe division)
242+
const exactMatchRate = exactMatches / validationSummaries.totalQuestions;
225243

226-
const comprehensiveScore = Math.max(0, baseScore - failurePenalty + exactMatchBonus);
244+
// Calculate bonus that ensures final score never exceeds 100
245+
const maxPossibleBonus = Math.max(0, 100 - avgIndividualScore);
246+
const exactMatchBonus = exactMatchRate * Math.min(5, maxPossibleBonus);
247+
248+
const finalScore = avgIndividualScore + exactMatchBonus;
249+
250+
// Validate final score is a valid number
251+
if (!isFinite(finalScore)) {
252+
console.log(`Invalid final score calculated for ${modelKey}: ${finalScore}`);
253+
return 0;
254+
}
227255

228-
// Convert back to 0-100 scale for consistency with other scores
229-
return Math.round(comprehensiveScore * 100);
256+
return Math.round(finalScore);
230257
}
231258

232259
/**

0 commit comments

Comments
 (0)