Skip to content

Commit

Permalink
fix: update evaluator tests to match latest implementation (#34)
Browse files: browse the repository at this point in the history
* fix: update evaluator tests to match latest implementation

Co-Authored-By: Han Xiao <[email protected]>

* fix: update EvaluationResponse type and add comprehensive tests

Co-Authored-By: Han Xiao <[email protected]>

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Han Xiao <[email protected]>
  • Loading branch information
devin-ai-integration[bot] and hanxiao authored Feb 6, 2025
1 parent 0c74746 commit df99251
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 5 deletions.
64 changes: 59 additions & 5 deletions src/tools/__tests__/evaluator.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,76 @@ describe('evaluateAnswer', () => {
const { response } = await evaluateAnswer(
'What is TypeScript?',
'TypeScript is a strongly typed programming language that builds on JavaScript.',
['definitive'],
tokenTracker
);
expect(response).toHaveProperty('is_definitive');
expect(response).toHaveProperty('reasoning');
expect(response).toHaveProperty('pass');
expect(response).toHaveProperty('think');
expect(response.type).toBe('definitive');
expect(response.pass).toBe(true);
});

// Freshness check: an answer that presents an April 2020 release as the
// "latest" Node.js version is expected to be flagged as outdated and rejected.
it('should evaluate answer freshness', async () => {
  const tracker = new TokenTracker();
  const question = 'What is the latest version of Node.js?';
  const answer = 'The latest version of Node.js is 14.0.0, released in April 2020.';

  const result = await evaluateAnswer(question, answer, ['freshness'], tracker);
  const evaluation = result.response;

  // Fields common to every evaluation verdict.
  expect(evaluation).toHaveProperty('pass');
  expect(evaluation).toHaveProperty('think');
  expect(evaluation.type).toBe('freshness');

  // Freshness-specific details attached to the verdict.
  const freshness = evaluation.freshness_analysis;
  expect(freshness).toBeDefined();
  expect(freshness?.likely_outdated).toBe(true);
  expect(freshness?.dates_mentioned).toContain('2020-04');
  expect(freshness?.current_time).toBeDefined();

  // An outdated answer must not pass.
  expect(evaluation.pass).toBe(false);
});

// Plurality check: the question asks for three items but the answer names
// only one, so the evaluation is expected to report the mismatch and fail.
it('should evaluate answer plurality', async () => {
  const tracker = new TokenTracker();
  const question = 'List three programming languages.';
  const answer = 'Python is a programming language.';

  const result = await evaluateAnswer(question, answer, ['plurality'], tracker);
  const evaluation = result.response;

  // Fields common to every evaluation verdict.
  expect(evaluation).toHaveProperty('pass');
  expect(evaluation).toHaveProperty('think');
  expect(evaluation.type).toBe('plurality');

  // Plurality-specific details: expected vs. provided item counts.
  const plurality = evaluation.plurality_analysis;
  expect(plurality).toBeDefined();
  expect(plurality?.expects_multiple).toBe(true);
  expect(plurality?.provides_multiple).toBe(false);
  expect(plurality?.count_expected).toBe(3);
  expect(plurality?.count_provided).toBe(1);

  // Too few items means the answer must not pass.
  expect(evaluation.pass).toBe(false);
});

// Ordering check: with several evaluation types requested, a non-committal
// answer is expected to fail the first one ('definitive'), and the returned
// verdict should carry no freshness or plurality analysis.
it('should evaluate in order and stop at first failure', async () => {
  const tracker = new TokenTracker();
  const { response: verdict } = await evaluateAnswer(
    'List the latest Node.js versions.',
    'I am not sure about the Node.js versions.',
    ['definitive', 'freshness', 'plurality'],
    tracker
  );

  // The verdict comes from the first requested check, which fails.
  expect(verdict.type).toBe('definitive');
  expect(verdict.pass).toBe(false);

  // Nothing from the later checks is present on the verdict.
  expect(verdict.freshness_analysis).toBeUndefined();
  expect(verdict.plurality_analysis).toBeUndefined();
});

// Verifies that running an evaluation reports usage to the supplied
// TokenTracker: `trackUsage` must be called with the 'evaluator' label and a
// numeric token count.
it('should track token usage', async () => {
  const tokenTracker = new TokenTracker();
  const spy = jest.spyOn(tokenTracker, 'trackUsage');

  // The return value is intentionally ignored; only the tracker side effect
  // is under test here.
  await evaluateAnswer(
    'What is TypeScript?',
    'TypeScript is a strongly typed programming language that builds on JavaScript.',
    ['definitive', 'freshness', 'plurality'],
    tokenTracker
  );

  // Only the shape of the call is asserted; the exact count is not pinned.
  expect(spy).toHaveBeenCalledWith('evaluator', expect.any(Number));
});
});
});
Expand Down
13 changes: 13 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,19 @@ export interface ReadResponse {
/**
 * Verdict produced when an answer is evaluated against a question.
 *
 * `pass` and `think` are always present. `type` and the per-type analysis
 * objects are optional.
 *
 * NOTE(review): field meanings below are inferred from the field names and
 * from how the evaluator tests use them; confirm against the evaluator
 * implementation.
 */
export type EvaluationResponse = {
  /** Whether the answer passed the evaluation. */
  pass: boolean;
  /** Free-text reasoning accompanying the verdict. */
  think: string;
  /** Which kind of evaluation this verdict belongs to. */
  type?: 'definitive' | 'freshness' | 'plurality';
  /** Details for a 'freshness' evaluation. */
  freshness_analysis?: {
    /** True when the answer appears to be out of date. */
    likely_outdated: boolean;
    /** Dates found in the answer; presumably 'YYYY-MM'-style strings (TODO confirm format). */
    dates_mentioned: string[];
    /** Time the analysis treats as "now"; format not visible here. */
    current_time: string;
    /** Presumably the maximum acceptable age of the information, in days (TODO confirm). */
    max_age_days?: number;
  };
  /** Details for a 'plurality' evaluation. */
  plurality_analysis?: {
    /** True when the question asks for more than one item. */
    expects_multiple: boolean;
    /** True when the answer supplies more than one item. */
    provides_multiple: boolean;
    /** Number of items the question asks for, when it can be determined. */
    count_expected?: number;
    /** Number of items the answer actually supplies. */
    count_provided: number;
  };
};

export type ErrorAnalysisResponse = {
Expand Down

0 comments on commit df99251

Please sign in to comment.