Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 74 additions & 6 deletions src/src/lib/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,21 @@ export function calculateRanks(metrics: ModelMetrics[]): ModelMetrics[] {
});
}

/**
* Calculates a comprehensive exactness score for a model based on validation results.
*
* This function calculates the average of individual exactness scores across all questions,
* which provides a more accurate representation than using average distance metrics.
*
* The score properly accounts for:
* 1. Individual question scores (including perfect 100 scores)
* 2. Failed queries (scored as 0)
* 3. Exact match bonus for perfect accuracy
*/
function blendedExactnessScore(provider: string, model: string) {
const modelKey = `${provider}/${model}`;

// Validate that model stats exist
if (
!validationSummaries.modelStats[
modelKey as keyof typeof validationSummaries.modelStats
Expand All @@ -187,15 +199,71 @@ function blendedExactnessScore(provider: string, model: string) {
return 0;
}

const { avgExactDistance, avgNumericDistance, avgFScore } =
validationSummaries.modelStats[
modelKey as keyof typeof validationSummaries.modelStats
];
const modelStats = validationSummaries.modelStats[
modelKey as keyof typeof validationSummaries.modelStats
];

// Validate required fields exist and are numbers
if (
typeof modelStats.totalMatches !== 'number' ||
typeof modelStats.exactMatches !== 'number' ||
typeof validationSummaries.totalQuestions !== 'number' ||
validationSummaries.totalQuestions === 0
) {
console.log(`Invalid validation data for ${modelKey}`);
return 0;
}

// strong preference for exact, numeric as backup, fscore as minor fallback (it's correlated with jaccard)
return blendScore(avgExactDistance, avgNumericDistance, avgFScore);
const { totalMatches, exactMatches } = modelStats;

// Calculate individual exactness scores for all questions
const individualScores: number[] = [];

// Get all question keys from validation results
const questionKeys = Object.keys(validationResults).filter(key => key !== '_summary');

// Validate we have questions to process
if (questionKeys.length === 0) {
console.log(`No questions found in validation results for ${modelKey}`);
return 0;
}

for (const question of questionKeys) {
const individualScore = getExactnessScore(provider, model, question);
individualScores.push(individualScore);
}

Comment on lines +230 to +235
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new implementation includes 0 scores for questions that models never attempted, unfairly lowering their exactness scores compared to the original calculation method.

View Details
📝 Patch Details
diff --git a/src/src/lib/eval.ts b/src/src/lib/eval.ts
index ee19459..ceff5c1 100644
--- a/src/src/lib/eval.ts
+++ b/src/src/lib/eval.ts
@@ -214,29 +214,21 @@ function blendedExactnessScore(provider: string, model: string) {
     return 0;
   }
 
-  const { totalMatches, exactMatches } = modelStats;
+  const { exactMatches, avgExactDistance, avgNumericDistance, avgFScore } = modelStats;
 
-  // Calculate individual exactness scores for all questions
-  const individualScores: number[] = [];
-  
-  // Get all question keys from validation results
-  const questionKeys = Object.keys(validationResults).filter(key => key !== '_summary');
-  
-  // Validate we have questions to process
-  if (questionKeys.length === 0) {
-    console.log(`No questions found in validation results for ${modelKey}`);
+  // Validate required aggregate fields exist and are numbers
+  if (
+    typeof avgExactDistance !== 'number' ||
+    typeof avgNumericDistance !== 'number' ||
+    typeof avgFScore !== 'number'
+  ) {
+    console.log(`Invalid aggregate distance data for ${modelKey}`);
     return 0;
   }
-  
-  for (const question of questionKeys) {
-    const individualScore = getExactnessScore(provider, model, question);
-    individualScores.push(individualScore);
-  }
-  
-  // Calculate average of individual scores (safe division)
-  const avgIndividualScore = individualScores.length > 0 
-    ? individualScores.reduce((sum, score) => sum + score, 0) / individualScores.length
-    : 0;
+
+  // Use pre-calculated aggregates that only include questions the model attempted
+  // This ensures models aren't penalized for unattempted questions
+  const avgIndividualScore = blendScore(avgExactDistance, avgNumericDistance, avgFScore);
   
   // Apply exact match bonus (safe division)
   const exactMatchRate = exactMatches / validationSummaries.totalQuestions;

Analysis

Unfair scoring penalty in blendedExactnessScore() for models with unattempted questions

What fails: blendedExactnessScore() in src/src/lib/eval.ts iterates over all 50 questions and calls getExactnessScore() for each one. That call returns 0 for questions the model never attempted, so the averaged score comes out lower than the one computed from the pre-calculated aggregates.

How to reproduce:

  1. Check models with unattempted questions (e.g., deepseek/deepseek-chat-v3-0324:free has 4 unattempted questions)
  2. Current implementation averages scores across all 50 questions (including 0s for unattempted)
  3. Pre-calculated aggregates (avgExactDistance, avgNumericDistance, avgFScore) only include attempted questions

Result: Models like deepseek/deepseek-chat-v3-0324:free get artificially low scores (48 points instead of 56) because unattempted questions are counted as 0 instead of being excluded from the calculation.

Expected: Use pre-calculated aggregate statistics that only consider questions the model actually attempted, matching the original benchmark methodology that updates stats only when modelResult exists

// Calculate average of individual scores (safe division)
const avgIndividualScore = individualScores.length > 0
? individualScores.reduce((sum, score) => sum + score, 0) / individualScores.length
: 0;

// Apply exact match bonus (safe division)
const exactMatchRate = exactMatches / validationSummaries.totalQuestions;
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Mismatched Counts Skew Exact Match Rate

The blendedExactnessScore function calculates the exact match rate using validationSummaries.totalQuestions, but processes individual scores based on questionKeys. If these counts don't align, the exact match rate will be inaccurate, affecting the final blended score.

Fix in Cursor Fix in Web


// Calculate bonus that ensures final score never exceeds 100
const maxPossibleBonus = Math.max(0, 100 - avgIndividualScore);
const exactMatchBonus = exactMatchRate * Math.min(5, maxPossibleBonus);

const finalScore = avgIndividualScore + exactMatchBonus;

// Validate final score is a valid number
if (!isFinite(finalScore)) {
console.log(`Invalid final score calculated for ${modelKey}: ${finalScore}`);
return 0;
}

return Math.round(finalScore);
}

/**
 * Collapses three per-model metrics into one quality score using a fixed
 * weighted sum: 65% exact, 25% numeric, 10% f-score.
 *
 * The two distance inputs are inverted (1 - value) before weighting, so a
 * smaller distance contributes a larger share. Inputs are not clamped.
 *
 * @param exact - Exact distance metric (0 = perfect match, 1 = complete mismatch)
 * @param numeric - Numeric distance metric (0 = perfect match, 1 = complete mismatch)
 * @param fscore - F-score metric (0 = worst, 1 = best)
 * @returns Weighted score; lies on the 0-100 scale (100 = perfect) when every input is within [0, 1]
 */
function blendScore(exact: number, numeric: number, fscore: number) {
  // Each term is one metric's weighted contribution on a 0-1 scale.
  const exactTerm = 0.65 * (1 - exact);
  const numericTerm = 0.25 * (1 - numeric);
  const fscoreTerm = 0.1 * fscore;

  // Summed left to right, then scaled to the 0-100 range.
  return 100 * (exactTerm + numericTerm + fscoreTerm);
}
Expand Down