Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions EVAL_WORKFLOW.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,3 +227,26 @@ Your solution should:
- Group related functionality together
- Include type imports from generated files
- Add helpful comments for complex logic

## AI grading

The evals include a lightweight AI grader that reviews the generated project for each eval and provides concise reasoning on pass/fail.

- The grader builds a prompt from `TASK.txt` plus a manifest of files from the generated output directory and asks a model to decide pass/fail with reasoning.
- The helper logs reasoning on every run and, on failure, throws an error with that reasoning so it appears directly in the test output and in `run.log`.

### How to use in a grader test

Add a single standardized test using the helper. Call it once per grader file, using one of the two forms shown below:

```ts
import { createAIGraderTest } from "../../../grader/aiGrader";

// Basic usage (default name and 60s timeout)
createAIGraderTest(import.meta.url);

// Optional: custom test name and timeout (timeout in milliseconds)
createAIGraderTest(import.meta.url, "AI grader assessment", 60000);
```

That’s it. On failure, the thrown error message will include the AI reasoning. The same reasoning is logged to the console and captured in the eval’s `run.log` by the default Vitest reporter.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ Output:

Optional Convex summary posting (still local mode): set both `CONVEX_EVAL_ENDPOINT` and `CONVEX_AUTH_TOKEN`.

## AI grading helper

Grader tests can include an AI-based assessment that logs concise pass/fail reasoning on every run and, on failure, fails the test with that reasoning in the error message. See the "AI grading" section in `EVAL_WORKFLOW.md` for details and usage with `createAIGraderTest(import.meta.url)`.

## Rerunning grading

After running the evals, you may want to dig into a particular test failure. You can use the `run_grader.py` script to grade the evaluations again without regenerating them:
Expand Down
25 changes: 25 additions & 0 deletions bun.lock
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
"": {
"name": "evals-convex",
"dependencies": {
"@ai-sdk/openai": "^2.0.19",
"@types/bun": "^1.2.20",
"@types/node": "^22.12.0",
"ai": "^5.0.22",
"convex": "^1.18.2",
"dotenv": "^17.2.1",
"prettier": "^3.4.2",
"typescript-eslint": "^8.23.0",
"vitest": "^3.0.2",
"zod": "^3.23.8",
},
"devDependencies": {
"@eslint/eslintrc": "^3.2.0",
Expand All @@ -27,6 +30,14 @@
},
},
"packages": {
"@ai-sdk/gateway": ["@ai-sdk/[email protected]", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-ErwWS3sPOuWy42eE3AVxlKkTa1XjjKBEtNCOylVKMO5KNyz5qie8QVlLYbULOG56dtxX4zTKX3rQNJudplhcmQ=="],

"@ai-sdk/openai": ["@ai-sdk/[email protected]", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-sG3/IVaPvV7Vn6513I1bcJILHpLCXbVif2ht6CyROcB9FzXCJe2K5uRbAg30HWsdCEe7xu4OAWtMK6yWTOcsSA=="],

"@ai-sdk/provider": ["@ai-sdk/[email protected]", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="],

"@ai-sdk/provider-utils": ["@ai-sdk/[email protected]", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.3", "zod-to-json-schema": "^3.24.1" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-HliwB/yzufw3iwczbFVE2Fiwf1XqROB/I6ng8EKUsPM5+2wnIa8f4VbljZcDx+grhFrPV+PnRZH7zBqi8WZM7Q=="],

"@ampproject/remapping": ["@ampproject/[email protected]", "", { "dependencies": { "@jridgewell/gen-mapping": "^0.3.5", "@jridgewell/trace-mapping": "^0.3.24" } }, "sha512-30iZtAPgz+LTIYoeivqYo853f02jBYSd5uGnGpkFV0M3xOt9aN73erkgYAmZU43x4VfqcnLxW9Kpg3R5LC4YYw=="],

"@babel/code-frame": ["@babel/[email protected]", "", { "dependencies": { "@babel/helper-validator-identifier": "^7.25.9", "js-tokens": "^4.0.0", "picocolors": "^1.0.0" } }, "sha512-RJlIHRueQgwWitWgF8OdFYGZX328Ax5BCemNGlqHfplnRT9ESi8JkFlvaVYbS+UubVY6dpv87Fs2u5M29iNFVQ=="],
Expand Down Expand Up @@ -157,6 +168,8 @@

"@nodelib/fs.walk": ["@nodelib/[email protected]", "", { "dependencies": { "@nodelib/fs.scandir": "2.1.5", "fastq": "^1.6.0" } }, "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg=="],

"@opentelemetry/api": ["@opentelemetry/[email protected]", "", {}, "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg=="],

"@rollup/rollup-android-arm-eabi": ["@rollup/[email protected]", "", { "os": "android", "cpu": "arm" }, "sha512-9NrR4033uCbUBRgvLcBrJofa2KY9DzxL2UKZ1/4xA/mnTNyhZCWBuD8X3tPm1n4KxcgaraOYgrFKSgwjASfmlA=="],

"@rollup/rollup-android-arm64": ["@rollup/[email protected]", "", { "os": "android", "cpu": "arm64" }, "sha512-iBbODqT86YBFHajxxF8ebj2hwKm1k8PTBQSojSt3d1FFt1gN+xf4CowE47iN0vOSdnd+5ierMHBbu/rHc7nq5g=="],
Expand Down Expand Up @@ -195,6 +208,8 @@

"@rollup/rollup-win32-x64-msvc": ["@rollup/[email protected]", "", { "os": "win32", "cpu": "x64" }, "sha512-ul8rnCsUumNln5YWwz0ted2ZHFhzhRRnkpBZ+YRuHoRAlUji9KChpOUOndY7uykrPEPXVbHLlsdo6v5yXo/TXw=="],

"@standard-schema/spec": ["@standard-schema/[email protected]", "", {}, "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA=="],

"@types/babel__core": ["@types/[email protected]", "", { "dependencies": { "@babel/parser": "^7.20.7", "@babel/types": "^7.20.7", "@types/babel__generator": "*", "@types/babel__template": "*", "@types/babel__traverse": "*" } }, "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA=="],

"@types/babel__generator": ["@types/[email protected]", "", { "dependencies": { "@babel/types": "^7.0.0" } }, "sha512-ASsj+tpEDsEiFr1arWrlN6V3mdfjRMZt6LtK/Vp/kreFLnr5QH5+DhvD5nINYZXzwJvXeGq+05iUXcAzVrqWtw=="],
Expand Down Expand Up @@ -249,6 +264,8 @@

"acorn-jsx": ["[email protected]", "", { "peerDependencies": { "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ=="],

"ai": ["[email protected]", "", { "dependencies": { "@ai-sdk/gateway": "1.0.11", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.5", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4" } }, "sha512-RZiYhj7Ux7hrLtXkHPcxzdiSZt4NOiC69O5AkNfMCsz3twwz/KRkl9ASptosoOsg833s5yRcTSdIu5z53Sl6Pw=="],

"ajv": ["[email protected]", "", { "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", "json-schema-traverse": "^0.4.1", "uri-js": "^4.2.2" } }, "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g=="],

"ansi-styles": ["[email protected]", "", { "dependencies": { "color-convert": "^2.0.1" } }, "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg=="],
Expand Down Expand Up @@ -335,6 +352,8 @@

"esutils": ["[email protected]", "", {}, "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g=="],

"eventsource-parser": ["[email protected]", "", {}, "sha512-bSRG85ZrMdmWtm7qkF9He9TNRzc/Bm99gEJMaQoHJ9E6Kv9QBbsldh2oMj7iXmYNEAVvNgvv5vPorG6W+XtBhQ=="],

"expect-type": ["[email protected]", "", {}, "sha512-bFi65yM+xZgk+u/KRIpekdSYkTB5W1pEf0Lt8Q8Msh7b+eQ7LXVtIB1Bkm4fvclDEL1b2CZkMhv2mOeF8tMdkA=="],

"fast-deep-equal": ["[email protected]", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="],
Expand Down Expand Up @@ -391,6 +410,8 @@

"json-buffer": ["[email protected]", "", {}, "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ=="],

"json-schema": ["[email protected]", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="],

"json-schema-traverse": ["[email protected]", "", {}, "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg=="],

"json-stable-stringify-without-jsonify": ["[email protected]", "", {}, "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw=="],
Expand Down Expand Up @@ -527,6 +548,10 @@

"yocto-queue": ["[email protected]", "", {}, "sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q=="],

"zod": ["[email protected]", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],

"zod-to-json-schema": ["[email protected]", "", { "peerDependencies": { "zod": "^3.24.1" } }, "sha512-h/z3PKvcTcTetyjl1fkj79MHNEjm+HpD6NXheWjzOekY7kV+lwDYnHw+ivHkijnCSMz1yJaWBD9vu/Fcmk+vEg=="],

"@babel/core/semver": ["[email protected]", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="],

"@babel/helper-compilation-targets/semver": ["[email protected]", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="],
Expand Down
3 changes: 3 additions & 0 deletions evals/000-fundamentals/008-helper_fns/grader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import {
} from "../../../grader";
import { api } from "./answer/convex/_generated/api";
import { Doc, Id } from "./answer/convex/_generated/dataModel";
import { createAIGraderTest } from "../../../grader/aiGrader";

createAIGraderTest(import.meta.url);

test("getItem and updateItem handle non-existent items", async () => {
// Try to get a non-existent item
Expand Down
4 changes: 4 additions & 0 deletions evals/001-data_modeling/007-schema_evolution/grader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ import {
import { api, internal } from "./answer/convex/_generated/api";
import { Doc } from "./answer/convex/_generated/dataModel";

import { createAIGraderTest } from "../../../grader/aiGrader";

createAIGraderTest(import.meta.url);

test("migration helper transforms data correctly", async () => {
// Insert a product with old schema format
await addDocuments(responseAdminClient, "products", [
Expand Down
3 changes: 3 additions & 0 deletions evals/001-data_modeling/009-normalize_json/grader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import {
hasIndexWithPrefix,
getSchema,
} from "../../../grader";
import { createAIGraderTest } from "../../../grader/aiGrader";

createAIGraderTest(import.meta.url);

test("organization data model works correctly", async () => {
const schema = (await getSchema(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import { expect, test } from "vitest";
import { responseAdminClient, addDocuments } from "../../../grader";
import { createAIGraderTest } from "../../../grader/aiGrader";

createAIGraderTest(import.meta.url);

test("schema validates different notification types correctly", async () => {
// Valid notifications
Expand Down
162 changes: 92 additions & 70 deletions evals/001-data_modeling/011-deconstruct_validators/grader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ import {
import { resultValidator } from "./answer/convex/schema";
import { VLiteral, VObject, VString } from "convex/values";

import { createAIGraderTest } from "../../../grader/aiGrader";

createAIGraderTest(import.meta.url);

afterAll(async () => {
await deleteAllDocuments(responseAdminClient, ["llm_calls", "api_calls"]);
});
Expand All @@ -26,11 +30,16 @@ test("resultValidator is exported as the correct type", async () => {
expect(resultValidator.members[0].fields.success.kind).toBe("literal");
expect(resultValidator.members[1].fields).toHaveProperty("success");
expect(resultValidator.members[1].fields.success.kind).toBe("literal");
let [success, error] = resultValidator.members as VObject<{ success: false; error?: string; value?: string }, {
success: VLiteral<false | true, "required">;
error?: VString<string, "required">;
value?: VString<string, "optional">;
}, "required", "success" | "error" | "value">[];
let [success, error] = resultValidator.members as VObject<
{ success: false; error?: string; value?: string },
{
success: VLiteral<false | true, "required">;
error?: VString<string, "required">;
value?: VString<string, "optional">;
},
"required",
"success" | "error" | "value"
>[];
if (success.fields.success.value !== true) {
[success, error] = [error, success];
}
Expand All @@ -40,82 +49,95 @@ test("resultValidator is exported as the correct type", async () => {
expect(error.fields.error!.kind).toBe("string");
});


test("schema validates successful results correctly", async () => {
await expect(addDocuments(responseAdminClient, "llm_calls", [
{
prompt: "What is the capital of France?",
result: {
success: true,
value: "Paris"
}
}
])).resolves.toBeUndefined();
await expect(
addDocuments(responseAdminClient, "llm_calls", [
{
prompt: "What is the capital of France?",
result: {
success: true,
value: "Paris",
},
},
]),
).resolves.toBeUndefined();

await expect(addDocuments(responseAdminClient, "api_calls", [
{
url: "https://api.example.com/data",
result: {
success: true,
value: "response data"
}
}
])).resolves.toBeUndefined();
await expect(
addDocuments(responseAdminClient, "api_calls", [
{
url: "https://api.example.com/data",
result: {
success: true,
value: "response data",
},
},
]),
).resolves.toBeUndefined();
});

test("schema validates error results correctly", async () => {
await expect(addDocuments(responseAdminClient, "llm_calls", [
{
prompt: "Invalid prompt",
result: {
success: false,
error: "Failed to process prompt"
}
}
])).resolves.toBeUndefined();
await expect(
addDocuments(responseAdminClient, "llm_calls", [
{
prompt: "Invalid prompt",
result: {
success: false,
error: "Failed to process prompt",
},
},
]),
).resolves.toBeUndefined();

await expect(addDocuments(responseAdminClient, "api_calls", [
{
url: "https://api.example.com/invalid",
result: {
success: false,
error: "404 Not Found"
}
}
])).resolves.toBeUndefined();
await expect(
addDocuments(responseAdminClient, "api_calls", [
{
url: "https://api.example.com/invalid",
result: {
success: false,
error: "404 Not Found",
},
},
]),
).resolves.toBeUndefined();
});

test("schema rejects invalid result formats", async () => {
// Missing required fields
await expect(addDocuments(responseAdminClient, "llm_calls", [
{
prompt: "test",
result: {
success: true
// missing value field
}
}
])).rejects.toThrow();
await expect(
addDocuments(responseAdminClient, "llm_calls", [
{
prompt: "test",
result: {
success: true,
// missing value field
},
},
]),
).rejects.toThrow();

// Wrong field types
await expect(addDocuments(responseAdminClient, "api_calls", [
{
url: "https://example.com",
result: {
success: false,
error: 123 // should be string
}
}
])).rejects.toThrow();
await expect(
addDocuments(responseAdminClient, "api_calls", [
{
url: "https://example.com",
result: {
success: false,
error: 123, // should be string
},
},
]),
).rejects.toThrow();

// Invalid success value
await expect(addDocuments(responseAdminClient, "llm_calls", [
{
prompt: "test",
result: {
success: "yes", // should be boolean literal
value: "test"
}
}
])).rejects.toThrow();
});
await expect(
addDocuments(responseAdminClient, "llm_calls", [
{
prompt: "test",
result: {
success: "yes", // should be boolean literal
value: "test",
},
},
]),
).rejects.toThrow();
});
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ import {
} from "../../../grader";
import { api } from "./answer/convex/_generated/api";
import { beforeEach } from "node:test";
import { createAIGraderTest } from "../../../grader/aiGrader";

createAIGraderTest(import.meta.url);

type IdOwners = string & { __tableName: "owners" };
type DogRow = {
Expand Down
3 changes: 3 additions & 0 deletions evals/002-queries/006-three_level_join/grader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ import {
listTable,
} from "../../../grader";
import { api } from "./answer/convex/_generated/api";
import { createAIGraderTest } from "../../../grader/aiGrader";

createAIGraderTest(import.meta.url);

test("compare schema", async ({ skip }) => {
await compareSchema(skip);
Expand Down
3 changes: 3 additions & 0 deletions evals/002-queries/007-aggregation/grader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ import {
addDocuments,
} from "../../../grader";
import { anyApi } from "convex/server";
import { createAIGraderTest } from "../../../grader/aiGrader";

createAIGraderTest(import.meta.url);

test("compare schema", async ({ skip }) => {
await compareSchema(skip);
Expand Down
Loading