axiomhq · c-ehrlich · Oct 23, 2025 · Oct 23, 2025 · Oct 23, 2025 · Oct 23, 2025
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -39,8 +39,8 @@ jobs:
           restore-keys: |
             ${{ runner.os }}-turbo-
       - run: pnpm install --frozen-lockfile --ignore-scripts
-      - run: pnpm build
       - run: pnpm format:check
+      - run: pnpm build
       - run: pnpm lint
       - run: pnpm typecheck
       - run: pnpm test
diff --git a/...als-nextjs/src/lib/capabilities/classify-ticket/evaluations/ticket-classification.eval.ts b/...als-nextjs/src/lib/capabilities/classify-ticket/evaluations/ticket-classification.eval.ts
@@ -1,7 +1,21 @@
-import { experimental_Eval as Eval } from 'axiom/ai/evals';
+import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals';
 import { jaccardResponseScorer, spamClassificationScorer } from '../../../scorers';
 import { classifyTicketStep } from '../../../capabilities/classify-ticket/prompts';
 import { pickFlags } from '@/lib/app-scope';
+import { ExactMatch } from 'autoevals';
+
+const WrappedExactMatch = Scorer( // adapts the autoevals ExactMatch scorer to the object-shaped output of this eval
+  'Exact match',
+  (args: {
+    output: { response: string; category: string };
+    expected: { response: string; category: string };
+  }) => {
+    return ExactMatch({ // only the response strings are forwarded; category is not compared here
+      output: args.output.response,
+      expected: args.expected.response,
+    });
+  },
+);
 
 Eval('Spam classification', {
   configFlags: pickFlags('ticketClassification'),
@@ -30,7 +44,7 @@ Eval('Spam classification', {
   task: async ({ input }) => {
     return await classifyTicketStep(input);
   },
-  scorers: [spamClassificationScorer, jaccardResponseScorer],
+  scorers: [spamClassificationScorer, jaccardResponseScorer, WrappedExactMatch],
   metadata: {
     description: 'Classify support tickets as spam or not spam',
   },

diff --git a/examples/example-evals-nextjs/src/lib/scorers.ts b/examples/example-evals-nextjs/src/lib/scorers.ts
@@ -1,44 +1,26 @@
 import { Scorer } from 'axiom/ai/evals';
 import type z from 'zod';
 
-import {
-  type SupportTicketResponseSchema,
-  // type SupportTicketInputSchema,
-} from './capabilities/classify-ticket/schemas';
+import { type SupportTicketResponseSchema } from './capabilities/classify-ticket/schemas';
 
-// an example of a custom scorer
-export const exactMatchScorer = Scorer('Exact Match', ({ output, expected }) =>
-  output === expected ? 1 : 0,
+type SupportTicketResponse = z.infer<typeof SupportTicketResponseSchema>;
+
+export const exactMatchScorer = Scorer(
+  'Exact Match',
+  ({ output, expected }: { output: SupportTicketResponse; expected: SupportTicketResponse }) =>
+    output.response === expected.response ? 1 : 0, // scores 1 when the response strings are strictly equal, otherwise 0
 );
 
 export const spamClassificationScorer = Scorer(
   'Spam Classification',
-  (args: {
-    output: z.infer<typeof SupportTicketResponseSchema>;
-    expected?: z.infer<typeof SupportTicketResponseSchema>;
-  }) => {
-    const { output, expected } = args;
-    if (!expected) {
-      throw new Error('No expected value provided');
-    }
-
+  ({ output, expected }: { output: SupportTicketResponse; expected: SupportTicketResponse }) => {
     return (expected.category === 'spam') === (output.category === 'spam') ? 1 : 0;
   },
 );
 
 export const jaccardResponseScorer = Scorer(
   'Jaccard Response',
-  ({
-    output,
-    expected,
-  }: {
-    output: z.infer<typeof SupportTicketResponseSchema>;
-    expected?: z.infer<typeof SupportTicketResponseSchema>;
-  }) => {
-    if (!expected) {
-      throw new Error('No expected value provided');
-    }
-
+  ({ output, expected }: { output: SupportTicketResponse; expected: SupportTicketResponse }) => {
     const expectedTokens = new Set(expected.response.toLowerCase().split(/\s+/));
     const outputTokens = new Set(output.response.toLowerCase().split(/\s+/));
 

diff --git a/examples/example-evals-nextjs/test/feature.eval.ts b/examples/example-evals-nextjs/test/feature.eval.ts
@@ -1,4 +1,4 @@
-import { experimental_Eval as Eval } from 'axiom/ai/evals';
+import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals';
 import { flag, fact, pickFlags } from '../src/lib/app-scope';
 
 const myFn = async (input: string, expected: string) => {
@@ -13,12 +13,10 @@ const myFn = async (input: string, expected: string) => {
 };
 
 // an example of a custom scorer
-const exactMatchScorer = ({ output, expected }: { output: string; expected?: string }) => {
-  return {
-    name: 'Exact match',
-    score: output == expected ? 1 : 0,
-  };
-};
+const ExactMatchScorer = Scorer( // built with the Scorer helper imported from axiom/ai/evals
+  'Exact match',
+  ({ output, expected }: { output: string; expected: string }) => (output === expected ? 1 : 0), // 1 on strict string equality, else 0
+);
 
 Eval('Basic demo', {
   configFlags: pickFlags('behavior'),
@@ -33,11 +31,9 @@ Eval('Basic demo', {
     },
   ],
   task: async ({ input, expected }) => {
-    const r = await myFn(input, expected);
-    // console.log('tktk context', getEvalContext());
-    return r;
+    return await myFn(input, expected);
   },
-  scorers: [exactMatchScorer],
+  scorers: [ExactMatchScorer],
   metadata: {
     description:
       'Demonstrates pickFlags functionality - only behavior namespace is available, ui namespace is excluded',

diff --git a/packages/ai/README.md b/packages/ai/README.md
@@ -10,6 +10,8 @@ Axiom AI SDK provides
 npm install axiom
 ```
 
+Evals require Node 22.20 or higher.
+
 ## Model Wrapping
 
 ```ts

diff --git a/packages/ai/src/evals.ts b/packages/ai/src/evals.ts
@@ -9,7 +9,7 @@ export { AxiomReporter as experimental_AxiomReporter } from './evals/reporter';
 export { withEvalContext, getEvalContext } from './evals/context/storage';
 export type { EvalContextData } from './evals/context/storage';
 
-export { defineEval, createTypedDefineEval } from './evals/builder';
+export { defineEval } from './evals/builder';
 export type { EvalBuilder } from './evals/builder';
 export { createAppScope } from './app-scope';
 export { validateCliFlags } from './validate-flags';

diff --git a/packages/ai/src/evals/builder.ts b/packages/ai/src/evals/builder.ts
@@ -99,20 +99,3 @@ export function defineEval<
 ): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput> {
   return new EvalBuilderImpl<AllowedFlags, TInput, TExpected, TOutput>(name, params);
 }
-
-/**
- * Pre-typed defineEval for app-specific flag/fact types.
- * Created by: const defineAppEval = createTypedDefineEval<AppFlags>();
- */
-export function createTypedDefineEval<AppFlags extends Record<string, any>>() {
-  return function defineAppEval<
-    TInput extends string | Record<string, any> = string,
-    TExpected extends string | Record<string, any> = string,
-    TOutput extends string | Record<string, any> = string,
-  >(
-    name: string,
-    params: EvalParams<TInput, TExpected, TOutput>,
-  ): EvalBuilder<AppFlags, TInput, TExpected, TOutput> {
-    return defineEval<TInput, TExpected, TOutput, AppFlags>(name, params);
-  };
-}
diff --git a/packages/ai/src/evals/eval.ts b/packages/ai/src/evals/eval.ts
@@ -13,12 +13,13 @@ import type {
   EvalTask,
   InputOf,
   ExpectedOf,
+  OutputOf,
   EvaluationReport,
   EvalCaseReport,
   RuntimeFlagLog,
   OutOfScopeFlag,
 } from './eval.types';
-import type { Score, Scorer } from './scorers';
+import type { ScoreWithName, ScorerLike } from './scorers';
 import { findBaseline, findEvaluationCases } from './eval.service';
 import { getGlobalFlagOverrides, setGlobalFlagOverrides } from './context/global-flags';
 import { deepEqual } from '../util/deep-equal';
@@ -76,28 +77,27 @@ const nanoid = customAlphabet('1234567890abcdefghijklmnopqrstuvwxyz', 10);
  */
 export function Eval<
   // Inference-friendly overload – no explicit generics required by callers.
-  const Data extends readonly { input: any; expected: any }[],
-  Out extends string | Record<string, any>,
-  TaskFn extends EvalTask<InputOf<Data>, ExpectedOf<Data>, Out>,
-  In = InputOf<Data>,
-  Exp = ExpectedOf<Data>,
+  Data extends readonly CollectionRecord<any, any>[],
+  TaskFn extends (args: {
+    input: InputOf<Data>;
+    expected: ExpectedOf<Data>;
+  }) => string | Record<string, any> | Promise<string | Record<string, any>>,
 >(
   name: string,
-  params: {
+  params: Omit<
+    EvalParams<InputOf<Data>, ExpectedOf<Data>, OutputOf<TaskFn>>,
+    'data' | 'task' | 'scorers'
+  > & {
     data: () => Data | Promise<Data>;
     task: TaskFn;
-    scorers: ReadonlyArray<Scorer<In, Exp, Out>>;
-    metadata?: Record<string, unknown>;
-    timeout?: number;
-    configFlags?: string[];
+    scorers: ReadonlyArray<ScorerLike<InputOf<Data>, ExpectedOf<Data>, OutputOf<TaskFn>>>;
   },
 ): void;
 
 /**
- *
+ * Explicit generics overload – allows users to pass explicit types.
  */
 export function Eval<
-  // Explicit generics overload – allows users to pass explicit types.
   TInput extends string | Record<string, any>,
   TExpected extends string | Record<string, any>,
   TOutput extends string | Record<string, any>,
@@ -136,6 +136,13 @@ function captureFlagConfig(configFlags?: string[]): Record<string, any> {
   return dotNotationToNested(filtered);
 }
 
+const getScorerName = <TScorer extends ScorerLike<any, any, any>>( // resolves the display name for a scorer
+  scorer: TScorer,
+  fallback: string = 'unknown',
+) => {
+  return (scorer as any).name || fallback; // any falsy name, including an empty string, yields the fallback
+};
+
 async function registerEval<
   TInput extends string | Record<string, any>,
   TExpected extends string | Record<string, any>,
@@ -144,8 +151,6 @@ async function registerEval<
   const datasetPromise = opts.data();
   const user = getGitUserInfo();
 
-  // TODO: EXPERIMENTS - we were creating `evalScope` here before
-
   // check if user passed a specific baseline id to the CLI
   const baselineId = inject('baseline');
   const isDebug = inject('debug');
@@ -297,7 +302,7 @@ async function registerEval<
 
       await it.concurrent.for(
         dataset.map((d, index) => ({ ...d, index }) satisfies CollectionRecordWithIndex),
-      )('case', async (data: CollectionRecordWithIndex, { task }) => {
+      )('case', async (data, { task }) => {
         const start = performance.now();
         if (!suiteContext) {
           throw new Error(
@@ -355,10 +360,11 @@ async function registerEval<
             overrides: result.overrides,
           };
 
-          const scoreList: Score[] = await Promise.all(
+          const scoreList: ScoreWithName[] = await Promise.all(
             opts.scorers.map(async (scorer) => {
+              const scorerName = getScorerName(scorer);
               const scorerSpan = startSpan(
-                `score ${scorer.name}`,
+                `score ${scorerName}`,
                 {
                   attributes: {
                     [Attr.GenAI.Operation.Name]: 'eval.score',
@@ -381,14 +387,15 @@ async function registerEval<
               const scoreValue = result.score as number;
 
               scorerSpan.setAttributes({
-                [Attr.Eval.Score.Name]: result.name,
+                [Attr.Eval.Score.Name]: scorerName,
                 [Attr.Eval.Score.Value]: scoreValue,
               });
 
               scorerSpan.setStatus({ code: SpanStatusCode.OK });
               scorerSpan.end();
 
               return {
+                name: scorerName,
                 ...result,
                 metadata: { duration, startedAt: start, error: null },
               };

diff --git a/packages/ai/src/evals/eval.types.ts b/packages/ai/src/evals/eval.types.ts
@@ -1,5 +1,5 @@
 import type { TaskMeta } from 'vitest';
-import type { Score, Scorer } from './scorers';
+import type { ScoreWithName, ScorerLike } from './scorers';
 
 // Type utilities for automatic inference
 /** Extract the input type from CollectionRecord[] */
@@ -14,8 +14,14 @@ export type ExpectedOf<Data extends readonly CollectionRecord<any, any>[]> =
 export type OutputOf<TaskFn extends (...args: any) => any> = TaskFn extends (
   ...args: any
 ) => AsyncIterable<infer O>
-  ? O
-  : Awaited<ReturnType<TaskFn>>;
+  ? O extends string | Record<string, any> // element type of the async iterable must be a string or a record
+    ? O
+    : never
+  : Awaited<ReturnType<TaskFn>> extends infer R // otherwise take the awaited return type under the same constraint
+    ? R extends string | Record<string, any>
+      ? R
+      : never
+    : never;
 
 /**
  * Function type for evaluation tasks that process input data and produce output.
@@ -86,7 +92,7 @@ export type EvalParams<
   /** The task function to evaluate */
   task: EvalTask<TInput, TExpected, TOutput>;
   /** Array of scoring functions to evaluate the task output */
-  scorers: ReadonlyArray<Scorer<TInput, TExpected, TOutput>>;
+  scorers: ReadonlyArray<ScorerLike<TInput, TExpected, TOutput>>;
   /** Optional metadata for the evaluation */
   metadata?: Record<string, unknown>;
   /** Optional timeout in milliseconds for task execution */
@@ -201,7 +207,7 @@ export type EvalCaseReport = {
   /** Expected output for comparison */
   expected: string | Record<string, any>;
   /** Array of {@link Score} results from all scorers that were run */
-  scores: Record<string, Score>;
+  scores: Record<string, ScoreWithName>;
   /** Any errors that occurred during evaluation */
   errors: Error[] | null;
   /** Status of the evaluation case */
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,6 +10,8 @@ Axiom AI SDK provides @@
     npm install axiom
     ```
+    Evals require Node 22.20 or higher.
     ## Model Wrapping
     ```ts
@@ Expand Down @@