Add evaluation group and refactor evaluations (#20)

* tmp * Update comment * Update README * tmp * Bump version
lmnr-ai · Oct 4, 2024 · ad3c75f · ad3c75f
1 parent 3acdc3d
commit ad3c75f
Show file tree

Hide file tree

Showing 5 changed files with 70 additions and 121 deletions.
diff --git a/README.md b/README.md
@@ -120,7 +120,8 @@ evaluate({
     executor: (data) => writePoem(data),
     evaluators: {
         containsPoem: (output, target) => target.poem.includes(output) ? 1 : 0
-    }
+    },
+    groupId: 'my_first_feature'
 })
 ```
 

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@lmnr-ai/lmnr",
-  "version": "0.4.12",
+  "version": "0.4.13",
   "description": "TypeScript SDK for Laminar AI",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",

diff --git a/src/evaluations.ts b/src/evaluations.ts
@@ -1,7 +1,7 @@
 import { Laminar } from "./laminar";
 import { EvaluationDatapoint } from "./types";
 import cliProgress from "cli-progress";
-import { isNumber, otelTraceIdToUUID, StringUUID } from "./utils";
+import { isNumber, otelTraceIdToUUID } from "./utils";
 import { observe } from "./decorators";
 import { trace } from "@opentelemetry/api";
 import { SPAN_TYPE } from "./sdk/tracing/attributes";
@@ -19,6 +19,26 @@ const getEvaluationUrl = (projectId: string, evaluationId: string) => {
     return `https://www.lmnr.ai/project/${projectId}/evaluations/${evaluationId}`;
 }
 
+const getAverageScores = (results: EvaluationDatapoint<any, any, any>[]): Record<string, number> => {
+    const perScoreValues: Record<string, number[]> = {};
+    for (const result of results) {
+        for (const key in result.scores) {
+            if (perScoreValues[key]) {
+                perScoreValues[key].push(result.scores[key]);
+            } else {
+                perScoreValues[key] = [result.scores[key]];
+            }
+        }
+    }
+
+    const averageScores: Record<string, number> = {};
+    for (const key in perScoreValues) {
+        averageScores[key] = perScoreValues[key].reduce((a, b) => a + b, 0) / perScoreValues[key].length;
+    }
+
+    return averageScores;
+}
+
 /**
  * Configuration for the Evaluator
  */
@@ -83,6 +103,10 @@ interface EvaluationConstructorProps<D, T, O> {
      * `evaluator_${index}`, where index is the index of the evaluator function in the list starting from 1.
      */
     evaluators: Record<string, EvaluatorFunction<O, T>>;
+    /**
+     * Optional group id of the evaluation. Defaults to "default".
+     */
+    groupId?: string;
     /**
      * Name of the evaluation.
      */
@@ -102,9 +126,7 @@ class EvaluationReporter {
 
     constructor() {}
 
-    public start({name, projectId, id, length}: {name: string, projectId: string, id: string, length: number}) {
-        process.stdout.write(`\nRunning evaluation ${name}...\n\n`);
-        process.stdout.write(`Check progress and results at ${getEvaluationUrl(projectId, id)}\n\n`);
+    public start({length}: {length: number}) {
         this.cliProgress.start(length, 0);
     }
 
@@ -120,96 +142,85 @@ class EvaluationReporter {
     }
 
     // Call either error or stop, not both
-    public stop(averageScores: Record<string, number>) {
+    public stop({averageScores, projectId, evaluationId}: {averageScores: Record<string, number>, projectId: string, evaluationId: string}) {
         this.cliProgress.stop();
+        process.stdout.write(`\nCheck progress and results at ${getEvaluationUrl(projectId, evaluationId)}\n`);
         process.stdout.write('\nAverage scores:\n');
         for (const key in averageScores) {
-            process.stdout.write(`${key}: ${averageScores[key]}\n`);
+            process.stdout.write(`${key}: ${JSON.stringify(averageScores[key])}\n`);
         }
         process.stdout.write('\n');
     }
 }
 
 class Evaluation<D, T, O> {
     private isFinished: boolean = false;
-    private name?: string;
     private progressReporter: EvaluationReporter;
     private data: Datapoint<D, T>[] | Dataset<D, T>;
     private executor: (data: D, ...args: any[]) => O | Promise<O>;
     private evaluators: Record<string, EvaluatorFunction<O, T>>;
+    private groupId?: string;
+    private name?: string;
     private batchSize: number = DEFAULT_BATCH_SIZE;
 
-    /**
-     * Create a new evaluation and prepare data.
-     * @param props.data List of data points to evaluate. `data` is the input to the executor function, `target` is the input to the evaluator function.
-     * @param props.executor The executor function. Takes the data point + any additional arguments and returns the output to evaluate.
-     * @param props.evaluators Map from evaluator name to evaluator function. Each evaluator function takes the output of the executor and the target data, and returns.
-     * @param props.name Optional name of the evaluation.
-     * @param props.config Optional override configurations for the evaluator.
-     */
     constructor({
-        data, executor, evaluators, name, config
+        data, executor, evaluators, groupId, name, config
     }: EvaluationConstructorProps<D, T, O>) {
-        this.name = name;
+        if (Object.keys(evaluators).length === 0) {
+            throw new Error('No evaluators provided');
+        }
+
+        // Validate evaluator keys
+        for (const key in evaluators) {
+            if (!/^[\w\s-]+$/.test(key)) {
+                throw new Error(`Invalid evaluator key: "${key}". Keys must only contain letters, digits, hyphens, underscores, or spaces.`);
+            }
+        }
+
         this.progressReporter = new EvaluationReporter();
         this.data = data;
         this.executor = executor;
         this.evaluators = evaluators;
+        this.groupId = groupId;
+        this.name = name;
         if (config) {
             this.batchSize = config.batchSize ?? DEFAULT_BATCH_SIZE;
         }
         Laminar.initialize({ projectApiKey: config?.projectApiKey, baseUrl: config?.baseUrl, httpPort: config?.httpPort, grpcPort: config?.grpcPort, instrumentModules: config?.instrumentModules });
     }
 
-    /** 
-     * Runs the evaluation.
-     *
-     * Creates a new evaluation.
-     * Evaluates data points in batches of `batchSize`. The executor function is called on each data point
-     * to get the output, and then evaluate it by each evaluator function.
-     */
     public async run(): Promise<void> {
         if (this.isFinished) {
             throw new Error('Evaluation is already finished');
         }
 
-        const evaluation = await Laminar.createEvaluation(this.name);
-        this.progressReporter.start({name: evaluation.name, projectId: evaluation.projectId, id: evaluation.id, length: this.getLength()});
+        this.progressReporter.start({length: this.getLength()});
+        let resultDatapoints: EvaluationDatapoint<D, T, O>[];
         try {
-            await this.evaluateInBatches(evaluation.id);
+            resultDatapoints = await this.evaluateInBatches();
         } catch (e) {
-            await Laminar.updateEvaluationStatus(evaluation.id, 'Error');
             this.progressReporter.stopWithError(e as Error);
             this.isFinished = true;
             return;
         }
 
-        // If we update with status "Finished", we expect averageScores to be not empty
-        const updatedEvaluation = await Laminar.updateEvaluationStatus(evaluation.id, 'Finished');
-        const averageScores = updatedEvaluation.stats.averageScores;
-        this.progressReporter.stop(averageScores);
+        const evaluation = await Laminar.createEvaluation({groupId: this.groupId, name: this.name, data: resultDatapoints});
+        const averageScores = getAverageScores(resultDatapoints);
+        this.progressReporter.stop({averageScores, projectId: evaluation.projectId, evaluationId: evaluation.id});
         this.isFinished = true;
 
         await Laminar.shutdown();
     }
 
-    // TODO: Calculate duration of the evaluation and add it to the summary
-    public async evaluateInBatches(evaluationId: StringUUID): Promise<void> {
+    public async evaluateInBatches(): Promise<EvaluationDatapoint<D, T, O>[]> {
+        const resultDatapoints: EvaluationDatapoint<D, T, O>[] = [];
         for (let i = 0; i < this.getLength(); i += this.batchSize) {
             const batch = this.data.slice(i, i + this.batchSize);
-            try {
-                const results = await this.evaluateBatch(batch);
-
-                // TODO: This must happen on the background, while the next batch is being evaluated
-                // If we do this, then we can calculate the duration of the evaluation and add it to the summary
-                await Laminar.postEvaluationResults(evaluationId, results);
-            } catch (e) {
-                console.error(`Error evaluating batch: ${e}`);
-            } finally {
-                // Update progress regardless of success
-                this.progressReporter.update(batch.length);
-            }
+            const batchDatapoints = await this.evaluateBatch(batch);
+            resultDatapoints.push(...batchDatapoints);
+            this.progressReporter.update(batch.length);
         }
+        return resultDatapoints;
     }
 
     private async evaluateBatch(batch: Datapoint<D, T>[]): Promise<EvaluationDatapoint<D, T, O>[]> {
@@ -233,14 +244,12 @@ class Evaluation<D, T, O> {
                         return await evaluator(output, target);
                     }, output, target);
 
-                    // If the evaluator returns a single number, use the evaluator name as the key
                     if (isNumber(value)) {
                         if (isNaN(value)) {
                             throw new Error(`Evaluator ${evaluatorName} returned NaN`);
                         }
                         scores[evaluatorName] = value;
                     } else {
-                        // If the evaluator returns an object, merge its keys with the existing scores (flatten)
                         scores = { ...scores, ...value };
                     }
                 }
@@ -275,13 +284,14 @@ class Evaluation<D, T, O> {
  * @param props.data List of data points to evaluate. `data` is the input to the executor function, `target` is the input to the evaluator function.
  * @param props.executor The executor function. Takes the data point + any additional arguments and returns the output to evaluate.
  * @param props.evaluators Map from evaluator name to evaluator function. Each evaluator function takes the output of the executor and the target data, and returns.
- * @param props.name Optional name of the evaluation.
+ * @param props.groupId Group name which is same as the feature you are evaluating in your project or application. Defaults to "default".
+ * @param props.name Optional name of the evaluation. Used to easily identify the evaluation in the group.
  * @param props.config Optional override configurations for the evaluator.
  */
 export async function evaluate<D, T, O>({
-    data, executor, evaluators, name, config
+    data, executor, evaluators, groupId, name, config
 }: EvaluationConstructorProps<D, T, O>): Promise<void> {
-    const evaluation = new Evaluation({ data, executor, evaluators, name, config });
+    const evaluation = new Evaluation({ data, executor, evaluators, name, groupId, config });
     if (globalThis._set_global_evaluation) {
         globalThis._evaluation = evaluation;
     } else {

diff --git a/src/laminar.ts b/src/laminar.ts
@@ -1,4 +1,4 @@
-import { PipelineRunResponse, PipelineRunRequest, EvaluationDatapoint, EvaluationStatus, UpdateEvaluationResponse, CreateEvaluationResponse } from './types';
+import { PipelineRunResponse, PipelineRunRequest, EvaluationDatapoint, CreateEvaluationResponse } from './types';
 import { Attributes, AttributeValue, context, isSpanContextValid, TimeInput, trace } from '@opentelemetry/api';
 import { InitializeOptions, initialize as traceloopInitialize } from './sdk/node-server-sdk'
 import { otelSpanIdToUUID, otelTraceIdToUUID, StringUUID } from './utils';
@@ -319,73 +319,22 @@ export class Laminar {
         await forceFlush();
     }
 
-    public static async createEvaluation(name?: string): Promise<CreateEvaluationResponse> {
+    public static async createEvaluation<D, T, O>({groupId, name, data}: {groupId?: string, name?: string, data: EvaluationDatapoint<D, T, O>[]}): Promise<CreateEvaluationResponse> {
         const response = await fetch(`${this.baseHttpUrl}/v1/evaluations`, {
             method: 'POST',
             headers: this.getHeaders(),
             body: JSON.stringify({
+                groupId: groupId ?? null,
                 name: name ?? null,
+                points: data,
             })
         });
 
         if (!response.ok) {
             throw new Error(`Failed to create evaluation ${name}. Response: ${response.statusText}`);
         }
 
-        return await response.json();
-    }
-
-    public static async postEvaluationResults<D, T, O>(
-        evaluationId: StringUUID,
-        data: EvaluationDatapoint<D, T, O>[]
-    ): Promise<void> {
-        const body = JSON.stringify({
-            evaluationId,
-            points: data,
-        });
-        const headers = this.getHeaders();
-        const url = `${this.baseHttpUrl}/v1/evaluation-datapoints`;
-        try {
-            const response = await fetch(url, {
-                method: "POST",
-                headers,
-                body,
-            });
-            if (!response.ok) {
-                console.error("Failed to send evaluation results. Response: ");
-                response.text().then((text) => console.error(text));
-            }
-        } catch (error) {
-            console.error("Failed to send evaluation results. Error: ", error);
-        };
-    }
-
-    /**
-     * Updates the status of an evaluation. Returns the updated evaluation object.
-     * 
-     * @param evaluationId - The ID of the evaluation to update.
-     * @param status - The status to set for the evaluation.
-     */
-    public static async updateEvaluationStatus(
-        evaluationId: string,
-        status: EvaluationStatus,
-    ): Promise<UpdateEvaluationResponse> {
-        const body = JSON.stringify({
-            status: status,
-        });
-        const headers = this.getHeaders();
-        const url = `${this.baseHttpUrl}/v1/evaluations/${evaluationId}`;
-
-        const response = await fetch(url, {
-            method: "POST",
-            headers,
-            body,
-        });
-        if (!response.ok) {
-            throw new Error(`Failed to update evaluation status ${evaluationId}. Response: ${await response.text()}`);
-        }
-
-        return await response.json() as UpdateEvaluationResponse;
+        return await response.json() as CreateEvaluationResponse;
     }
 
     private static getHeaders() {

diff --git a/src/types.ts b/src/types.ts
@@ -40,20 +40,11 @@ export type Event = {
 export type CreateEvaluationResponse = {
     id: StringUUID,
     createdAt: Date,
+    groupId: string,
     name: string,
-    status: EvaluationStatus
     projectId: StringUUID,
-    metadata: Record<string, any> | null,
 }
 
-type EvaluationStats = {
-    averageScores: Record<string, number>;
-}
-
-export type UpdateEvaluationResponse = {
-    stats: EvaluationStats;
-};
-
 export type EvaluationDatapoint<D, T, O> = {
     data: Record<string, any> & D;
     target: Record<string, any> & T;
@@ -62,8 +53,6 @@ export type EvaluationDatapoint<D, T, O> = {
     traceId: string;
 }
 
-export type EvaluationStatus = 'Started' | 'Finished' | 'Error';
-
 /**
  * Span types to categorize spans.
  *