Skip to content

Commit

Permalink
fix: adjust to incoming api changes
Browse files Browse the repository at this point in the history
  • Loading branch information
mdonnalley committed Feb 5, 2025
1 parent d5bc66c commit b8eff72
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 178 deletions.
91 changes: 52 additions & 39 deletions src/agentTester.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import { MaybeMock } from './maybe-mock';
export type TestStatus = 'NEW' | 'IN_PROGRESS' | 'COMPLETED' | 'ERROR' | 'TERMINATED';

export type AgentTestStartResponse = {
aiEvaluationId: string;
runId: string;
status: TestStatus;
};

Expand Down Expand Up @@ -54,6 +54,7 @@ export type TestCaseResult = {
errorCode?: string;
errorMessage?: string;
}>;
testNumber: number;
};

export type AgentTestResultsResponse = {
Expand All @@ -62,10 +63,7 @@ export type AgentTestResultsResponse = {
endTime?: string;
errorMessage?: string;
subjectName: string;
testSet: {
name: string;
testCases: TestCaseResult[];
};
testCases: TestCaseResult[];
};

export type AvailableDefinition = Omit<FileProperties, 'manageableState' | 'namespacePrefix'>;
Expand Down Expand Up @@ -160,11 +158,11 @@ export class AgentTester {
const statusResponse = await this.status(jobId);
if (statusResponse.status.toLowerCase() !== 'new') {
const resultsResponse = await this.results(jobId);
const totalTestCases = resultsResponse.testSet.testCases.length;
const passingTestCases = resultsResponse.testSet.testCases.filter(
const totalTestCases = resultsResponse.testCases.length;
const passingTestCases = resultsResponse.testCases.filter(
(tc) => tc.status.toLowerCase() === 'completed' && tc.testResults.every((r) => r.result === 'PASS')
).length;
const failingTestCases = resultsResponse.testSet.testCases.filter(
const failingTestCases = resultsResponse.testCases.filter(
(tc) =>
['error', 'completed'].includes(tc.status.toLowerCase()) &&
tc.testResults.some((r) => r.result === 'FAILURE')
Expand Down Expand Up @@ -286,15 +284,15 @@ export class AgentTester {
},
expectation: [
{
name: 'expectedTopic',
name: 'topic_sequence_match',
expectedValue: tc.expectedTopic,
},
{
name: 'expectedActions',
name: 'action_sequence_match',
expectedValue: `[${tc.expectedActions.map((v) => `"${v}"`).join(',')}]`,
},
{
name: 'expectedOutcome',
name: 'bot_response_rating',
expectedValue: tc.expectedOutcome,
},
],
Expand Down Expand Up @@ -334,6 +332,19 @@ export class AgentTester {
}
}

/**
 * Translate an expectation name into the short label shown in
 * human-readable output.
 *
 * Recognized names and their labels:
 * - `topic_sequence_match`  -> `Topic`
 * - `action_sequence_match` -> `Action`
 * - `bot_response_rating`   -> `Outcome`
 *
 * @param name The expectation name to translate.
 * @returns The matching label, or `name` itself when it is not one of the
 * recognized names.
 */
function humanFriendlyName(name: string): string {
  // A Map (rather than a plain object) keeps lookups limited to the entries
  // listed here, so names such as "toString" still fall through unchanged.
  const labels = new Map<string, string>([
    ['topic_sequence_match', 'Topic'],
    ['action_sequence_match', 'Action'],
    ['bot_response_rating', 'Outcome'],
  ]);

  return labels.get(name) ?? name;
}

function truncate(value: number, decimals = 2): string {
const remainder = value % 1;
// truncate remainder to specified decimals
Expand Down Expand Up @@ -402,19 +413,20 @@ export async function convertTestResultsToFormat(
}
}

async function humanFormat(details: AgentTestResultsResponse): Promise<string> {
async function humanFormat(results: AgentTestResultsResponse): Promise<string> {
const { Ux } = await import('@salesforce/sf-plugins-core');
const ux = new Ux();

const tables: string[] = [];
for (const testCase of details.testSet.testCases) {
const number = details.testSet.testCases.indexOf(testCase) + 1;
for (const testCase of results.testCases) {
const table = ux.makeTable({
title: `${ansis.bold(`Test Case #${number}`)}\n${ansis.dim('Utterance')}: ${testCase.inputs.utterance}`,
title: `${ansis.bold(`Test Case #${testCase.testNumber}`)}\n${ansis.dim('Utterance')}: ${
testCase.inputs.utterance
}`,
overflow: 'wrap',
columns: ['test', 'result', { key: 'expected', width: '40%' }, { key: 'actual', width: '40%' }],
data: testCase.testResults.map((r) => ({
test: r.name,
test: humanFriendlyName(r.name),
result: r.result === 'PASS' ? ansis.green('Pass') : ansis.red('Fail'),
expected: r.expectedValue,
actual: r.actualValue,
Expand All @@ -424,41 +436,44 @@ async function humanFormat(details: AgentTestResultsResponse): Promise<string> {
tables.push(table);
}

const topicPassCount = details.testSet.testCases.reduce((acc, tc) => {
const topicPassCount = results.testCases.reduce((acc, tc) => {
const topic = tc.testResults.find((r) => r.name === 'topic_sequence_match');
return topic?.result === 'PASS' ? acc + 1 : acc;
}, 0);
const topicPassPercent = (topicPassCount / details.testSet.testCases.length) * 100;
const topicPassPercent = (topicPassCount / results.testCases.length) * 100;

const actionPassCount = details.testSet.testCases.reduce((acc, tc) => {
const actionPassCount = results.testCases.reduce((acc, tc) => {
const action = tc.testResults.find((r) => r.name === 'action_sequence_match');
return action?.result === 'PASS' ? acc + 1 : acc;
}, 0);
const actionPassPercent = (actionPassCount / details.testSet.testCases.length) * 100;
const actionPassPercent = (actionPassCount / results.testCases.length) * 100;

const outcomePassCount = details.testSet.testCases.reduce((acc, tc) => {
const outcomePassCount = results.testCases.reduce((acc, tc) => {
const outcome = tc.testResults.find((r) => r.name === 'bot_response_rating');
return outcome?.result === 'PASS' ? acc + 1 : acc;
}, 0);
const outcomePassPercent = (outcomePassCount / details.testSet.testCases.length) * 100;
const outcomePassPercent = (outcomePassCount / results.testCases.length) * 100;

const results = {
Status: details.status,
Duration: details.endTime
? readableTime(new Date(details.endTime).getTime() - new Date(details.startTime).getTime())
const final = {
Status: results.status,
Duration: results.endTime
? readableTime(new Date(results.endTime).getTime() - new Date(results.startTime).getTime())
: 'Unknown',
'Topic Pass %': `${topicPassPercent.toFixed(2)}%`,
'Action Pass %': `${actionPassPercent.toFixed(2)}%`,
'Outcome Pass %': `${outcomePassPercent.toFixed(2)}%`,
};

const resultsTable = makeSimpleTable(results, ansis.bold.blue('Test Results'));
const resultsTable = makeSimpleTable(final, ansis.bold.blue('Test Results'));

const failedTestCases = details.testSet.testCases.filter((tc) => tc.status.toLowerCase() === 'error');
const failedTestCases = results.testCases.filter((tc) => tc.status.toLowerCase() === 'error');
const failedTestCasesObj = Object.fromEntries(
Object.entries(failedTestCases).map(([, tc]) => [
`Test Case #${failedTestCases.indexOf(tc) + 1}`,
tc.testResults.filter((r) => r.result === 'FAILURE').join(', '),
`Test Case #${tc.testNumber}`,
tc.testResults
.filter((r) => r.result === 'FAILURE')
.map((r) => humanFriendlyName(r.name))
.join(', '),
])
);
const failedTestCasesTable = makeSimpleTable(failedTestCasesObj, ansis.red.bold('Failed Test Cases'));
Expand All @@ -477,12 +492,12 @@ async function junitFormat(results: AgentTestResultsResponse): Promise<string> {
ignoreAttributes: false,
});

const testCount = results.testSet.testCases.length;
const failureCount = results.testSet.testCases.filter(
const testCount = results.testCases.length;
const failureCount = results.testCases.filter(
(tc) =>
['error', 'completed'].includes(tc.status.toLowerCase()) && tc.testResults.some((r) => r.result === 'FAILURE')
).length;
const time = results.testSet.testCases.reduce((acc, tc) => {
const time = results.testCases.reduce((acc, tc) => {
if (tc.endTime && tc.startTime) {
return acc + new Date(tc.endTime).getTime() - new Date(tc.startTime).getTime();
}
Expand All @@ -500,13 +515,13 @@ async function junitFormat(results: AgentTestResultsResponse): Promise<string> {
{ $name: 'start-time', $value: results.startTime },
{ $name: 'end-time', $value: results.endTime },
],
testsuite: results.testSet.testCases.map((testCase) => {
testsuite: results.testCases.map((testCase) => {
const testCaseTime = testCase.endTime
? new Date(testCase.endTime).getTime() - new Date(testCase.startTime).getTime()
: 0;

return {
$name: `${results.testSet.name}.${results.testSet.testCases.indexOf(testCase) + 1}`,
$name: testCase.testNumber,
$time: testCaseTime,
$assertions: testCase.testResults.length,
failure: testCase.testResults
Expand All @@ -527,13 +542,11 @@ async function junitFormat(results: AgentTestResultsResponse): Promise<string> {
async function tapFormat(results: AgentTestResultsResponse): Promise<string> {
const lines: string[] = [];
let expectationCount = 0;
for (const testCase of results.testSet.testCases) {
for (const testCase of results.testCases) {
for (const result of testCase.testResults) {
const status = result.result === 'PASS' ? 'ok' : 'not ok';
expectationCount++;
lines.push(
`${status} ${expectationCount} ${results.testSet.name}.${results.testSet.testCases.indexOf(testCase) + 1}`
);
lines.push(`${status} ${expectationCount} ${testCase.testNumber}.${result.name}`);
if (status === 'not ok') {
lines.push(' ---');
lines.push(` message: ${result.errorMessage ?? 'Unknown error'}`);
Expand Down
40 changes: 20 additions & 20 deletions test/agentTester.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ describe('AgentTester', () => {
const response = await tester.poll('4KBSM000000003F4AQ');
expect(response).to.be.ok;
// TODO: make these assertions more meaningful
expect(response.testSet.testCases[0].status).to.equal('COMPLETED');
expect(response.testCases[0].status).to.equal('COMPLETED');
});
});

Expand Down Expand Up @@ -136,15 +136,15 @@ testCases:
<utterance>List contact names associated with Acme account</utterance>
</inputs>
<expectation>
<name>expectedTopic</name>
<name>topic_sequence_match</name>
<expectedValue>GeneralCRM</expectedValue>
</expectation>
<expectation>
<name>expectedActions</name>
<name>action_sequence_match</name>
<expectedValue>[&quot;IdentifyRecordByName&quot;,&quot;QueryRecords&quot;]</expectedValue>
</expectation>
<expectation>
<name>expectedOutcome</name>
<name>bot_response_rating</name>
<expectedValue>contacts available name available with Acme are listed</expectedValue>
</expectation>
</testCase>
Expand All @@ -154,15 +154,15 @@ testCases:
<utterance>List contact emails associated with Acme account</utterance>
</inputs>
<expectation>
<name>expectedTopic</name>
<name>topic_sequence_match</name>
<expectedValue>GeneralCRM</expectedValue>
</expectation>
<expectation>
<name>expectedActions</name>
<name>action_sequence_match</name>
<expectedValue>[&quot;IdentifyRecordByName&quot;,&quot;QueryRecords&quot;]</expectedValue>
</expectation>
<expectation>
<name>expectedOutcome</name>
<name>bot_response_rating</name>
<expectedValue>contacts available emails available with Acme are listed</expectedValue>
</expectation>
</testCase>
Expand Down Expand Up @@ -191,10 +191,10 @@ describe('junit formatter', () => {
<property name="status" value="COMPLETED"></property>
<property name="start-time" value="2024-11-28T12:00:00Z"></property>
<property name="end-time" value="2024-11-28T12:00:48.56Z"></property>
<testsuite name="CRM_Sanity_v1.1" time="10000" assertions="3"></testsuite>
<testsuite name="CRM_Sanity_v1.2" time="10000" assertions="3">
<failure message="Actual response does not match the expected response" name="expectedActions"></failure>
<failure message="Actual response does not match the expected response" name="expectedOutcome"></failure>
<testsuite name="1" time="10000" assertions="3"></testsuite>
<testsuite name="2" time="10000" assertions="3">
<failure message="Actual response does not match the expected response" name="action_sequence_match"></failure>
<failure message="Actual response does not match the expected response" name="bot_response_rating"></failure>
</testsuite>
</testsuites>`);
});
Expand All @@ -205,23 +205,23 @@ describe('tap formatter', () => {
const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json', 'utf8');
const input = JSON.parse(raw) as AgentTestResultsResponse;
const output = await convertTestResultsToFormat(input, 'tap');
expect(output).to.deep.equal(`Tap Version 14
expect(output).to.equal(`Tap Version 14
1..6
ok 1 CRM_Sanity_v1.1
ok 2 CRM_Sanity_v1.1
ok 3 CRM_Sanity_v1.1
ok 4 CRM_Sanity_v1.2
not ok 5 CRM_Sanity_v1.2
ok 1 1.topic_sequence_match
ok 2 1.action_sequence_match
ok 3 1.bot_response_rating
ok 4 2.topic_sequence_match
not ok 5 2.action_sequence_match
---
message: Actual response does not match the expected response
expectation: expectedActions
expectation: action_sequence_match
actual: ["IdentifyRecordByName","QueryRecords"]
expected: ["IdentifyRecordByName","QueryRecords","GetActivitiesTimeline"]
...
not ok 6 CRM_Sanity_v1.2
not ok 6 2.bot_response_rating
---
message: Actual response does not match the expected response
expectation: expectedOutcome
expectation: bot_response_rating
actual: It looks like I am unable to find the information you are looking for due to access restrictions. How else can I assist you?
expected: Summary of open cases and activities associated with timeline
...`);
Expand Down
2 changes: 1 addition & 1 deletion test/mocks/einstein_ai-evaluations_runs.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"aiEvaluationId": "4KBSM000000003F4AQ",
"runId": "4KBSM000000003F4AQ",
"status": "NEW"
}
Loading

0 comments on commit b8eff72

Please sign in to comment.