Skip to content

Commit

Permalink
Add evaluation group and refactor evaluations (#20)
Browse files Browse the repository at this point in the history
* tmp

* Update comment

* Update README

* tmp

* Bump version
  • Loading branch information
temirrr authored Oct 4, 2024
1 parent 3acdc3d commit ad3c75f
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 121 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,8 @@ evaluate({
executor: (data) => writePoem(data),
evaluators: {
containsPoem: (output, target) => target.poem.includes(output) ? 1 : 0
}
},
groupId: 'my_first_feature'
})
```

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@lmnr-ai/lmnr",
"version": "0.4.12",
"version": "0.4.13",
"description": "TypeScript SDK for Laminar AI",
"main": "dist/index.js",
"types": "dist/index.d.ts",
Expand Down
112 changes: 61 additions & 51 deletions src/evaluations.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { Laminar } from "./laminar";
import { EvaluationDatapoint } from "./types";
import cliProgress from "cli-progress";
import { isNumber, otelTraceIdToUUID, StringUUID } from "./utils";
import { isNumber, otelTraceIdToUUID } from "./utils";
import { observe } from "./decorators";
import { trace } from "@opentelemetry/api";
import { SPAN_TYPE } from "./sdk/tracing/attributes";
Expand All @@ -19,6 +19,26 @@ const getEvaluationUrl = (projectId: string, evaluationId: string) => {
return `https://www.lmnr.ai/project/${projectId}/evaluations/${evaluationId}`;
}

const getAverageScores = (results: EvaluationDatapoint<any, any, any>[]): Record<string, number> => {
const perScoreValues: Record<string, number[]> = {};
for (const result of results) {
for (const key in result.scores) {
if (perScoreValues[key]) {
perScoreValues[key].push(result.scores[key]);
} else {
perScoreValues[key] = [result.scores[key]];
}
}
}

const averageScores: Record<string, number> = {};
for (const key in perScoreValues) {
averageScores[key] = perScoreValues[key].reduce((a, b) => a + b, 0) / perScoreValues[key].length;
}

return averageScores;
}

/**
* Configuration for the Evaluator
*/
Expand Down Expand Up @@ -83,6 +103,10 @@ interface EvaluationConstructorProps<D, T, O> {
* `evaluator_${index}`, where index is the index of the evaluator function in the list starting from 1.
*/
evaluators: Record<string, EvaluatorFunction<O, T>>;
/**
* Optional group id of the evaluation. Defaults to "default".
*/
groupId?: string;
/**
* Name of the evaluation.
*/
Expand All @@ -102,9 +126,7 @@ class EvaluationReporter {

constructor() {}

public start({name, projectId, id, length}: {name: string, projectId: string, id: string, length: number}) {
process.stdout.write(`\nRunning evaluation ${name}...\n\n`);
process.stdout.write(`Check progress and results at ${getEvaluationUrl(projectId, id)}\n\n`);
public start({length}: {length: number}) {
this.cliProgress.start(length, 0);
}

Expand All @@ -120,96 +142,85 @@ class EvaluationReporter {
}

// Call either error or stop, not both
public stop(averageScores: Record<string, number>) {
public stop({averageScores, projectId, evaluationId}: {averageScores: Record<string, number>, projectId: string, evaluationId: string}) {
this.cliProgress.stop();
process.stdout.write(`\nCheck progress and results at ${getEvaluationUrl(projectId, evaluationId)}\n`);
process.stdout.write('\nAverage scores:\n');
for (const key in averageScores) {
process.stdout.write(`${key}: ${averageScores[key]}\n`);
process.stdout.write(`${key}: ${JSON.stringify(averageScores[key])}\n`);
}
process.stdout.write('\n');
}
}

class Evaluation<D, T, O> {
private isFinished: boolean = false;
private name?: string;
private progressReporter: EvaluationReporter;
private data: Datapoint<D, T>[] | Dataset<D, T>;
private executor: (data: D, ...args: any[]) => O | Promise<O>;
private evaluators: Record<string, EvaluatorFunction<O, T>>;
private groupId?: string;
private name?: string;
private batchSize: number = DEFAULT_BATCH_SIZE;

/**
* Create a new evaluation and prepare data.
* @param props.data List of data points to evaluate. `data` is the input to the executor function, `target` is the input to the evaluator function.
* @param props.executor The executor function. Takes the data point + any additional arguments and returns the output to evaluate.
* @param props.evaluators Map from evaluator name to evaluator function. Each evaluator function takes the output of the executor and the target data, and returns.
* @param props.name Optional name of the evaluation.
* @param props.config Optional override configurations for the evaluator.
*/
constructor({
data, executor, evaluators, name, config
data, executor, evaluators, groupId, name, config
}: EvaluationConstructorProps<D, T, O>) {
this.name = name;
if (Object.keys(evaluators).length === 0) {
throw new Error('No evaluators provided');
}

// Validate evaluator keys
for (const key in evaluators) {
if (!/^[\w\s-]+$/.test(key)) {
throw new Error(`Invalid evaluator key: "${key}". Keys must only contain letters, digits, hyphens, underscores, or spaces.`);
}
}

this.progressReporter = new EvaluationReporter();
this.data = data;
this.executor = executor;
this.evaluators = evaluators;
this.groupId = groupId;
this.name = name;
if (config) {
this.batchSize = config.batchSize ?? DEFAULT_BATCH_SIZE;
}
Laminar.initialize({ projectApiKey: config?.projectApiKey, baseUrl: config?.baseUrl, httpPort: config?.httpPort, grpcPort: config?.grpcPort, instrumentModules: config?.instrumentModules });
}

/**
* Runs the evaluation.
*
* Creates a new evaluation.
* Evaluates data points in batches of `batchSize`. The executor function is called on each data point
* to get the output, and then evaluate it by each evaluator function.
*/
public async run(): Promise<void> {
if (this.isFinished) {
throw new Error('Evaluation is already finished');
}

const evaluation = await Laminar.createEvaluation(this.name);
this.progressReporter.start({name: evaluation.name, projectId: evaluation.projectId, id: evaluation.id, length: this.getLength()});
this.progressReporter.start({length: this.getLength()});
let resultDatapoints: EvaluationDatapoint<D, T, O>[];
try {
await this.evaluateInBatches(evaluation.id);
resultDatapoints = await this.evaluateInBatches();
} catch (e) {
await Laminar.updateEvaluationStatus(evaluation.id, 'Error');
this.progressReporter.stopWithError(e as Error);
this.isFinished = true;
return;
}

// If we update with status "Finished", we expect averageScores to be not empty
const updatedEvaluation = await Laminar.updateEvaluationStatus(evaluation.id, 'Finished');
const averageScores = updatedEvaluation.stats.averageScores;
this.progressReporter.stop(averageScores);
const evaluation = await Laminar.createEvaluation({groupId: this.groupId, name: this.name, data: resultDatapoints});
const averageScores = getAverageScores(resultDatapoints);
this.progressReporter.stop({averageScores, projectId: evaluation.projectId, evaluationId: evaluation.id});
this.isFinished = true;

await Laminar.shutdown();
}

// TODO: Calculate duration of the evaluation and add it to the summary
public async evaluateInBatches(evaluationId: StringUUID): Promise<void> {
public async evaluateInBatches(): Promise<EvaluationDatapoint<D, T, O>[]> {
const resultDatapoints: EvaluationDatapoint<D, T, O>[] = [];
for (let i = 0; i < this.getLength(); i += this.batchSize) {
const batch = this.data.slice(i, i + this.batchSize);
try {
const results = await this.evaluateBatch(batch);

// TODO: This must happen on the background, while the next batch is being evaluated
// If we do this, then we can calculate the duration of the evaluation and add it to the summary
await Laminar.postEvaluationResults(evaluationId, results);
} catch (e) {
console.error(`Error evaluating batch: ${e}`);
} finally {
// Update progress regardless of success
this.progressReporter.update(batch.length);
}
const batchDatapoints = await this.evaluateBatch(batch);
resultDatapoints.push(...batchDatapoints);
this.progressReporter.update(batch.length);
}
return resultDatapoints;
}

private async evaluateBatch(batch: Datapoint<D, T>[]): Promise<EvaluationDatapoint<D, T, O>[]> {
Expand All @@ -233,14 +244,12 @@ class Evaluation<D, T, O> {
return await evaluator(output, target);
}, output, target);

// If the evaluator returns a single number, use the evaluator name as the key
if (isNumber(value)) {
if (isNaN(value)) {
throw new Error(`Evaluator ${evaluatorName} returned NaN`);
}
scores[evaluatorName] = value;
} else {
// If the evaluator returns an object, merge its keys with the existing scores (flatten)
scores = { ...scores, ...value };
}
}
Expand Down Expand Up @@ -275,13 +284,14 @@ class Evaluation<D, T, O> {
* @param props.data List of data points to evaluate. `data` is the input to the executor function, `target` is the input to the evaluator function.
* @param props.executor The executor function. Takes the data point + any additional arguments and returns the output to evaluate.
* @param props.evaluators Map from evaluator name to evaluator function. Each evaluator function takes the output of the executor and the target data, and returns.
* @param props.name Optional name of the evaluation.
* @param props.groupId Group name which is same as the feature you are evaluating in your project or application. Defaults to "default".
* @param props.name Optional name of the evaluation. Used to easily identify the evaluation in the group.
* @param props.config Optional override configurations for the evaluator.
*/
export async function evaluate<D, T, O>({
data, executor, evaluators, name, config
data, executor, evaluators, groupId, name, config
}: EvaluationConstructorProps<D, T, O>): Promise<void> {
const evaluation = new Evaluation({ data, executor, evaluators, name, config });
const evaluation = new Evaluation({ data, executor, evaluators, name, groupId, config });
if (globalThis._set_global_evaluation) {
globalThis._evaluation = evaluation;
} else {
Expand Down
61 changes: 5 additions & 56 deletions src/laminar.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { PipelineRunResponse, PipelineRunRequest, EvaluationDatapoint, EvaluationStatus, UpdateEvaluationResponse, CreateEvaluationResponse } from './types';
import { PipelineRunResponse, PipelineRunRequest, EvaluationDatapoint, CreateEvaluationResponse } from './types';
import { Attributes, AttributeValue, context, isSpanContextValid, TimeInput, trace } from '@opentelemetry/api';
import { InitializeOptions, initialize as traceloopInitialize } from './sdk/node-server-sdk'
import { otelSpanIdToUUID, otelTraceIdToUUID, StringUUID } from './utils';
Expand Down Expand Up @@ -319,73 +319,22 @@ export class Laminar {
await forceFlush();
}

public static async createEvaluation(name?: string): Promise<CreateEvaluationResponse> {
public static async createEvaluation<D, T, O>({groupId, name, data}: {groupId?: string, name?: string, data: EvaluationDatapoint<D, T, O>[]}): Promise<CreateEvaluationResponse> {
const response = await fetch(`${this.baseHttpUrl}/v1/evaluations`, {
method: 'POST',
headers: this.getHeaders(),
body: JSON.stringify({
groupId: groupId ?? null,
name: name ?? null,
points: data,
})
});

if (!response.ok) {
throw new Error(`Failed to create evaluation ${name}. Response: ${response.statusText}`);
}

return await response.json();
}

public static async postEvaluationResults<D, T, O>(
evaluationId: StringUUID,
data: EvaluationDatapoint<D, T, O>[]
): Promise<void> {
const body = JSON.stringify({
evaluationId,
points: data,
});
const headers = this.getHeaders();
const url = `${this.baseHttpUrl}/v1/evaluation-datapoints`;
try {
const response = await fetch(url, {
method: "POST",
headers,
body,
});
if (!response.ok) {
console.error("Failed to send evaluation results. Response: ");
response.text().then((text) => console.error(text));
}
} catch (error) {
console.error("Failed to send evaluation results. Error: ", error);
};
}

/**
* Updates the status of an evaluation. Returns the updated evaluation object.
*
* @param evaluationId - The ID of the evaluation to update.
* @param status - The status to set for the evaluation.
*/
public static async updateEvaluationStatus(
evaluationId: string,
status: EvaluationStatus,
): Promise<UpdateEvaluationResponse> {
const body = JSON.stringify({
status: status,
});
const headers = this.getHeaders();
const url = `${this.baseHttpUrl}/v1/evaluations/${evaluationId}`;

const response = await fetch(url, {
method: "POST",
headers,
body,
});
if (!response.ok) {
throw new Error(`Failed to update evaluation status ${evaluationId}. Response: ${await response.text()}`);
}

return await response.json() as UpdateEvaluationResponse;
return await response.json() as CreateEvaluationResponse;
}

private static getHeaders() {
Expand Down
13 changes: 1 addition & 12 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,20 +40,11 @@ export type Event = {
export type CreateEvaluationResponse = {
id: StringUUID,
createdAt: Date,
groupId: string,
name: string,
status: EvaluationStatus
projectId: StringUUID,
metadata: Record<string, any> | null,
}

type EvaluationStats = {
averageScores: Record<string, number>;
}

export type UpdateEvaluationResponse = {
stats: EvaluationStats;
};

export type EvaluationDatapoint<D, T, O> = {
data: Record<string, any> & D;
target: Record<string, any> & T;
Expand All @@ -62,8 +53,6 @@ export type EvaluationDatapoint<D, T, O> = {
traceId: string;
}

export type EvaluationStatus = 'Started' | 'Finished' | 'Error';

/**
* Span types to categorize spans.
*
Expand Down

0 comments on commit ad3c75f

Please sign in to comment.