Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ jobs:
restore-keys: |
${{ runner.os }}-turbo-
- run: pnpm install --frozen-lockfile --ignore-scripts
- run: pnpm build
- run: pnpm format:check
- run: pnpm build
- run: pnpm lint
- run: pnpm typecheck
- run: pnpm test
Original file line number Diff line number Diff line change
@@ -1,7 +1,21 @@
import { experimental_Eval as Eval } from 'axiom/ai/evals';
import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals';
import { jaccardResponseScorer, spamClassificationScorer } from '../../../scorers';
import { classifyTicketStep } from '../../../capabilities/classify-ticket/prompts';
import { pickFlags } from '@/lib/app-scope';
import { ExactMatch } from 'autoevals';

// Adapts the `ExactMatch` scorer from autoevals to this eval's structured records:
// only the `response` fields of the output and expected values are compared.
const WrappedExactMatch = Scorer(
  'Exact match',
  ({
    output,
    expected,
  }: {
    output: { response: string; category: string };
    expected: { response: string; category: string };
  }) => ExactMatch({ output: output.response, expected: expected.response }),
);

Eval('Spam classification', {
configFlags: pickFlags('ticketClassification'),
Expand Down Expand Up @@ -30,7 +44,7 @@ Eval('Spam classification', {
task: async ({ input }) => {
return await classifyTicketStep(input);
},
scorers: [spamClassificationScorer, jaccardResponseScorer],
scorers: [spamClassificationScorer, jaccardResponseScorer, WrappedExactMatch],
metadata: {
description: 'Classify support tickets as spam or not spam',
},
Expand Down
36 changes: 9 additions & 27 deletions examples/example-evals-nextjs/src/lib/scorers.ts
Original file line number Diff line number Diff line change
@@ -1,44 +1,26 @@
import { Scorer } from 'axiom/ai/evals';
import type z from 'zod';

import {
type SupportTicketResponseSchema,
// type SupportTicketInputSchema,
} from './capabilities/classify-ticket/schemas';
import { type SupportTicketResponseSchema } from './capabilities/classify-ticket/schemas';

// an example of a custom scorer
export const exactMatchScorer = Scorer('Exact Match', ({ output, expected }) =>
output === expected ? 1 : 0,
type SupportTicketResponse = z.infer<typeof SupportTicketResponseSchema>;

// Example custom scorer: yields 1 when the produced `response` text is strictly
// equal to the expected `response` text, and 0 otherwise.
export const exactMatchScorer = Scorer(
  'Exact Match',
  (args: { output: SupportTicketResponse; expected: SupportTicketResponse }) => {
    const responsesMatch = args.output.response === args.expected.response;
    return responsesMatch ? 1 : 0;
  },
);

export const spamClassificationScorer = Scorer(
'Spam Classification',
(args: {
output: z.infer<typeof SupportTicketResponseSchema>;
expected?: z.infer<typeof SupportTicketResponseSchema>;
}) => {
const { output, expected } = args;
if (!expected) {
throw new Error('No expected value provided');
}

({ output, expected }: { output: SupportTicketResponse; expected: SupportTicketResponse }) => {
return (expected.category === 'spam') === (output.category === 'spam') ? 1 : 0;
},
);

export const jaccardResponseScorer = Scorer(
'Jaccard Response',
({
output,
expected,
}: {
output: z.infer<typeof SupportTicketResponseSchema>;
expected?: z.infer<typeof SupportTicketResponseSchema>;
}) => {
if (!expected) {
throw new Error('No expected value provided');
}

({ output, expected }: { output: SupportTicketResponse; expected: SupportTicketResponse }) => {
const expectedTokens = new Set(expected.response.toLowerCase().split(/\s+/));
const outputTokens = new Set(output.response.toLowerCase().split(/\s+/));

Expand Down
18 changes: 7 additions & 11 deletions examples/example-evals-nextjs/test/feature.eval.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { experimental_Eval as Eval } from 'axiom/ai/evals';
import { experimental_Eval as Eval, Scorer } from 'axiom/ai/evals';
import { flag, fact, pickFlags } from '../src/lib/app-scope';

const myFn = async (input: string, expected: string) => {
Expand All @@ -13,12 +13,10 @@ const myFn = async (input: string, expected: string) => {
};

// an example of a custom scorer
/**
 * Scores a task result by strict equality between the output and the expected value.
 *
 * Strict (`===`) comparison is used so that values of different runtime types
 * (for example the number `1` and the string `'1'`) are never coerced into a match.
 *
 * @param args.output - The value produced by the task.
 * @param args.expected - The expected value; when omitted, a string output never matches.
 * @returns An object with the scorer name (`'Exact match'`) and a score of 1 (match) or 0.
 */
const exactMatchScorer = ({
  output,
  expected,
}: {
  output: string;
  expected?: string;
}): { name: string; score: number } => {
  return {
    name: 'Exact match',
    score: output === expected ? 1 : 0,
  };
};
// Scorer built with `Scorer`: 1 when the output string is strictly equal to the
// expected string, 0 otherwise.
const ExactMatchScorer = Scorer(
  'Exact match',
  (args: { output: string; expected: string }) => {
    const isMatch = args.output === args.expected;
    return isMatch ? 1 : 0;
  },
);

Eval('Basic demo', {
configFlags: pickFlags('behavior'),
Expand All @@ -33,11 +31,9 @@ Eval('Basic demo', {
},
],
task: async ({ input, expected }) => {
const r = await myFn(input, expected);
// console.log('tktk context', getEvalContext());
return r;
return await myFn(input, expected);
},
scorers: [exactMatchScorer],
scorers: [ExactMatchScorer],
metadata: {
description:
'Demonstrates pickFlags functionality - only behavior namespace is available, ui namespace is excluded',
Expand Down
2 changes: 2 additions & 0 deletions packages/ai/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Axiom AI SDK provides
npm install axiom
```

Evals require Node 22.20 or higher.

## Model Wrapping

```ts
Expand Down
2 changes: 1 addition & 1 deletion packages/ai/src/evals.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export { AxiomReporter as experimental_AxiomReporter } from './evals/reporter';
export { withEvalContext, getEvalContext } from './evals/context/storage';
export type { EvalContextData } from './evals/context/storage';

export { defineEval, createTypedDefineEval } from './evals/builder';
export { defineEval } from './evals/builder';
export type { EvalBuilder } from './evals/builder';
export { createAppScope } from './app-scope';
export { validateCliFlags } from './validate-flags';
Expand Down
17 changes: 0 additions & 17 deletions packages/ai/src/evals/builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,20 +99,3 @@ export function defineEval<
): EvalBuilder<AllowedFlags, TInput, TExpected, TOutput> {
return new EvalBuilderImpl<AllowedFlags, TInput, TExpected, TOutput>(name, params);
}

/**
 * Builds a `defineEval` variant whose flags type parameter is fixed to `AppFlags`,
 * so callers only need to supply (or infer) the input, expected and output types.
 *
 * Usage: `const defineAppEval = createTypedDefineEval<AppFlags>();`
 *
 * @returns A `defineAppEval(name, params)` function that forwards to `defineEval`
 *   and yields an `EvalBuilder` typed with `AppFlags`.
 */
export function createTypedDefineEval<AppFlags extends Record<string, any>>() {
  function defineAppEval<
    TInput extends string | Record<string, any> = string,
    TExpected extends string | Record<string, any> = string,
    TOutput extends string | Record<string, any> = string,
  >(
    name: string,
    params: EvalParams<TInput, TExpected, TOutput>,
  ): EvalBuilder<AppFlags, TInput, TExpected, TOutput> {
    // Forward unchanged; only the flags type argument is pinned here.
    return defineEval<TInput, TExpected, TOutput, AppFlags>(name, params);
  }

  return defineAppEval;
}
45 changes: 26 additions & 19 deletions packages/ai/src/evals/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ import type {
EvalTask,
InputOf,
ExpectedOf,
OutputOf,
EvaluationReport,
EvalCaseReport,
RuntimeFlagLog,
OutOfScopeFlag,
} from './eval.types';
import type { Score, Scorer } from './scorers';
import type { ScoreWithName, ScorerLike } from './scorers';
import { findBaseline, findEvaluationCases } from './eval.service';
import { getGlobalFlagOverrides, setGlobalFlagOverrides } from './context/global-flags';
import { deepEqual } from '../util/deep-equal';
Expand Down Expand Up @@ -76,28 +77,27 @@ const nanoid = customAlphabet('1234567890abcdefghijklmnopqrstuvwxyz', 10);
*/
export function Eval<
// Inference-friendly overload – no explicit generics required by callers.
const Data extends readonly { input: any; expected: any }[],
Out extends string | Record<string, any>,
TaskFn extends EvalTask<InputOf<Data>, ExpectedOf<Data>, Out>,
In = InputOf<Data>,
Exp = ExpectedOf<Data>,
Data extends readonly CollectionRecord<any, any>[],
TaskFn extends (args: {
input: InputOf<Data>;
expected: ExpectedOf<Data>;
}) => string | Record<string, any> | Promise<string | Record<string, any>>,
>(
name: string,
params: {
params: Omit<
EvalParams<InputOf<Data>, ExpectedOf<Data>, OutputOf<TaskFn>>,
'data' | 'task' | 'scorers'
> & {
data: () => Data | Promise<Data>;
task: TaskFn;
scorers: ReadonlyArray<Scorer<In, Exp, Out>>;
metadata?: Record<string, unknown>;
timeout?: number;
configFlags?: string[];
scorers: ReadonlyArray<ScorerLike<InputOf<Data>, ExpectedOf<Data>, OutputOf<TaskFn>>>;
},
): void;

/**
*
* Explicit generics overload – allows users to pass explicit types.
*/
export function Eval<
// Explicit generics overload – allows users to pass explicit types.
TInput extends string | Record<string, any>,
TExpected extends string | Record<string, any>,
TOutput extends string | Record<string, any>,
Expand Down Expand Up @@ -136,6 +136,13 @@ function captureFlagConfig(configFlags?: string[]): Record<string, any> {
return dotNotationToNested(filtered);
}

/**
 * Resolves a display name for a scorer.
 *
 * Reads the scorer's `name` property without an `any` cast and only accepts it
 * when it is a non-empty string, so the result is always a string.
 *
 * @param scorer - The scorer whose `name` property should be read.
 * @param fallback - Name to use when the scorer has no usable name
 *   (missing, empty, or not a string). Defaults to `'unknown'`.
 * @returns The scorer's name, or `fallback`.
 */
const getScorerName = <TScorer extends ScorerLike<any, any, any>>(
  scorer: TScorer,
  fallback: string = 'unknown',
): string => {
  // `Object(...)` leaves functions/objects untouched and turns a nullish value into
  // an empty object, so the property read below can never throw.
  const candidate: unknown = Reflect.get(Object(scorer), 'name');
  return typeof candidate === 'string' && candidate !== '' ? candidate : fallback;
};

async function registerEval<
TInput extends string | Record<string, any>,
TExpected extends string | Record<string, any>,
Expand All @@ -144,8 +151,6 @@ async function registerEval<
const datasetPromise = opts.data();
const user = getGitUserInfo();

// TODO: EXPERIMENTS - we were creating `evalScope` here before

// check if user passed a specific baseline id to the CLI
const baselineId = inject('baseline');
const isDebug = inject('debug');
Expand Down Expand Up @@ -297,7 +302,7 @@ async function registerEval<

await it.concurrent.for(
dataset.map((d, index) => ({ ...d, index }) satisfies CollectionRecordWithIndex),
)('case', async (data: CollectionRecordWithIndex, { task }) => {
)('case', async (data, { task }) => {
const start = performance.now();
if (!suiteContext) {
throw new Error(
Expand Down Expand Up @@ -355,10 +360,11 @@ async function registerEval<
overrides: result.overrides,
};

const scoreList: Score[] = await Promise.all(
const scoreList: ScoreWithName[] = await Promise.all(
opts.scorers.map(async (scorer) => {
const scorerName = getScorerName(scorer);
const scorerSpan = startSpan(
`score ${scorer.name}`,
`score ${scorerName}`,
{
attributes: {
[Attr.GenAI.Operation.Name]: 'eval.score',
Expand All @@ -381,14 +387,15 @@ async function registerEval<
const scoreValue = result.score as number;

scorerSpan.setAttributes({
[Attr.Eval.Score.Name]: result.name,
[Attr.Eval.Score.Name]: scorerName,
[Attr.Eval.Score.Value]: scoreValue,
});

scorerSpan.setStatus({ code: SpanStatusCode.OK });
scorerSpan.end();

return {
name: scorerName,
...result,
metadata: { duration, startedAt: start, error: null },
};
Expand Down
16 changes: 11 additions & 5 deletions packages/ai/src/evals/eval.types.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { TaskMeta } from 'vitest';
import type { Score, Scorer } from './scorers';
import type { ScoreWithName, ScorerLike } from './scorers';

// Type utilities for automatic inference
/** Extract the input type from CollectionRecord[] */
Expand All @@ -14,8 +14,14 @@ export type ExpectedOf<Data extends readonly CollectionRecord<any, any>[]> =
export type OutputOf<TaskFn extends (...args: any) => any> = TaskFn extends (
...args: any
) => AsyncIterable<infer O>
? O
: Awaited<ReturnType<TaskFn>>;
? O extends string | Record<string, any>
? O
: never
: Awaited<ReturnType<TaskFn>> extends infer R
? R extends string | Record<string, any>
? R
: never
: never;

/**
* Function type for evaluation tasks that process input data and produce output.
Expand Down Expand Up @@ -86,7 +92,7 @@ export type EvalParams<
/** The task function to evaluate */
task: EvalTask<TInput, TExpected, TOutput>;
/** Array of scoring functions to evaluate the task output */
scorers: ReadonlyArray<Scorer<TInput, TExpected, TOutput>>;
scorers: ReadonlyArray<ScorerLike<TInput, TExpected, TOutput>>;
/** Optional metadata for the evaluation */
metadata?: Record<string, unknown>;
/** Optional timeout in milliseconds for task execution */
Expand Down Expand Up @@ -201,7 +207,7 @@ export type EvalCaseReport = {
/** Expected output for comparison */
expected: string | Record<string, any>;
/** Array of {@link Score} results from all scorers that were run */
scores: Record<string, Score>;
scores: Record<string, ScoreWithName>;
/** Any errors that occurred during evaluation */
errors: Error[] | null;
/** Status of the evaluation case */
Expand Down
Loading
Loading