diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts
index 93896aaf1..83414592a 100644
--- a/js/src/evaluation/_runner.ts
+++ b/js/src/evaluation/_runner.ts
@@ -27,26 +27,72 @@ export type TargetT =
 
 // Data format: dataset-name, dataset_id, or examples
 export type DataT = string | AsyncIterable<Example> | Example[];
 
-// Summary evaluator runs over the whole dataset
 // and reports aggregate metric(s)
+/** @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ... */
+type DeprecatedSyncSummaryEvaluator = (
+  runs: Array<Run>,
+  examples: Array<Example>
+) => EvaluationResult | EvaluationResults;
+
+/** @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ... */
+type DeprecatedAsyncSummaryEvaluator = (
+  runs: Array<Run>,
+  examples: Array<Example>
+) => Promise<EvaluationResult | EvaluationResults>;
+
+// Summary evaluator runs over the whole dataset
 export type SummaryEvaluatorT =
-  | ((
-      runs: Array<Run>,
-      examples: Array<Example>
-    ) => Promise<EvaluationResult | EvaluationResults>)
-  | ((
-      runs: Array<Run>,
-      examples: Array<Example>
-    ) => EvaluationResult | EvaluationResults);
+  | DeprecatedSyncSummaryEvaluator
+  | DeprecatedAsyncSummaryEvaluator
+  | ((args: {
+      runs?: Array<Run>;
+      examples?: Array<Example>;
+      inputs?: Array<Record<string, any>>;
+      outputs?: Array<Record<string, any>>;
+      referenceOutputs?: Array<Record<string, any>>;
+    }) => EvaluationResult | EvaluationResults)
+  | ((args: {
+      runs?: Array<Run>;
+      examples?: Array<Example>;
+      inputs?: Array<Record<string, any>>;
+      outputs?: Array<Record<string, any>>;
+      referenceOutputs?: Array<Record<string, any>>;
+    }) => Promise<EvaluationResult | EvaluationResults>);
+
+/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
+type DeprecatedRunEvaluator = RunEvaluator;
+
+/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
+type DeprecatedFunctionEvaluator = (
+  run: Run,
+  example?: Example
+) => EvaluationResult | EvaluationResults;
+
+/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
+type DeprecatedAsyncFunctionEvaluator = (
+  run: Run,
+  example?: Example
+) => Promise<EvaluationResult | EvaluationResults>;
 
 // Row-level evaluator
 export type EvaluatorT =
-  | RunEvaluator
-  | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults)
-  | ((
-      run: Run,
-      example?: Example
-    ) => Promise<EvaluationResult | EvaluationResults>);
+  | DeprecatedRunEvaluator
+  | DeprecatedFunctionEvaluator
+  | DeprecatedAsyncFunctionEvaluator
+  | ((args: {
+      run?: Run;
+      example?: Example;
+      inputs?: Record<string, any>;
+      outputs?: Record<string, any>;
+      referenceOutputs?: Record<string, any>;
+    }) => EvaluationResult | EvaluationResults)
+  | ((args: {
+      run?: Run;
+      example?: Example;
+      inputs?: Record<string, any>;
+      outputs?: Record<string, any>;
+      referenceOutputs?: Record<string, any>;
+    }) => Promise<EvaluationResult | EvaluationResults>);
 
 interface _ForwardResults {
   run: Run;
@@ -658,6 +704,7 @@ export class _ExperimentManager {
     for (const evaluator of wrappedEvaluators) {
       try {
         const summaryEvalResult = await evaluator(runsArray, examples);
+
         const flattenedResults =
           this.client._selectEvalResults(summaryEvalResult);
         aggregateFeedback.push(...flattenedResults);
@@ -965,6 +1012,31 @@ async function wrapSummaryEvaluators(
       _runs_: string,
       _examples_: string
     ): Promise<EvaluationResult | EvaluationResults> => {
+      // Check if the evaluator expects an object parameter
+      if (evaluator.length === 1) {
+        const inputs = examples.map((ex) => ex.inputs);
+        const outputs = runs.map((run) => run.outputs || {});
+        const referenceOutputs = examples.map((ex) => ex.outputs || {});
+
+        return Promise.resolve(
+          (
+            evaluator as (args: {
+              runs?: Run[];
+              examples?: Example[];
+              inputs?: Record<string, any>[];
+              outputs?: Record<string, any>[];
+              referenceOutputs?: Record<string, any>[];
+            }) => EvaluationResult | EvaluationResults
+          )({
+            runs,
+            examples,
+            inputs,
+            outputs,
+            referenceOutputs,
+          })
+        );
+      }
+      // Otherwise use the traditional (runs, examples) signature
       return Promise.resolve(evaluator(runs, examples));
     },
     { ...optionsArray, name: evalName }
diff --git a/js/src/evaluation/evaluate_comparative.ts b/js/src/evaluation/evaluate_comparative.ts
index 5a67ee9f5..5b66e1530 100644
--- a/js/src/evaluation/evaluate_comparative.ts
+++ b/js/src/evaluation/evaluate_comparative.ts
@@ -69,16 +69,29 @@ async function loadTraces(
   return results;
 }
 
+/** @deprecated Use _ComparativeEvaluator instead: (args: { runs, example, inputs, outputs, referenceOutputs }) => ... */
+export type _ComparativeEvaluatorLegacy = (
+  runs: Run[],
+  example: Example
+) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;
+
+export type _ComparativeEvaluator = (args: {
+  runs?: Run[];
+  example?: Example;
+  inputs?: Record<string, any>;
+  outputs?: Record<string, any>[];
+  referenceOutputs?: Record<string, any>;
+}) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;
+
+export type ComparativeEvaluator =
+  | _ComparativeEvaluatorLegacy
+  | _ComparativeEvaluator;
+
 export interface EvaluateComparativeOptions {
   /**
    * A list of evaluators to use for comparative evaluation.
    */
-  evaluators: Array<
-    (
-      runs: Run[],
-      example: Example
-    ) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>
-  >;
+  evaluators: Array<ComparativeEvaluator>;
   /**
    * Randomize the order of outputs for each evaluation
    * @default false
@@ -306,16 +319,20 @@ export async function evaluateComparative(
   async function evaluateAndSubmitFeedback(
     runs: Run[],
     example: Example,
-    evaluator: (
-      runs: Run[],
-      example: Example
-    ) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>
+    evaluator: ComparativeEvaluator
   ) {
     const expectedRunIds = new Set(runs.map((r) => r.id));
-    const result = await evaluator(
-      options.randomizeOrder ? shuffle(runs) : runs,
-      example
-    );
+    // Check if evaluator expects an object parameter
+    const result =
+      evaluator.length === 1
+        ? await (evaluator as _ComparativeEvaluator)({
+            runs: options.randomizeOrder ? shuffle(runs) : runs,
+            example,
+            inputs: example.inputs,
+            outputs: runs.map((run) => run.outputs || {}),
+            referenceOutputs: example.outputs || {},
+          })
+        : await (evaluator as _ComparativeEvaluatorLegacy)(runs, example);
 
     for (const [runId, score] of Object.entries(result.scores)) {
       // validate if the run id
diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts
index 92777082b..cad4707f1 100644
--- a/js/src/evaluation/evaluator.ts
+++ b/js/src/evaluation/evaluator.ts
@@ -95,7 +95,21 @@ export type RunEvaluatorLike =
       run: Run,
       example?: Example
     ) => Promise<EvaluationResult | EvaluationResults>)
-  | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults);
+  | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults)
+  | ((args: {
+      run?: Run;
+      example?: Example;
+      inputs?: Record<string, any>;
+      outputs?: Record<string, any>;
+      referenceOutputs?: Record<string, any>;
+    }) => EvaluationResult | EvaluationResults)
+  | ((args: {
+      run?: Run;
+      example?: Example;
+      inputs?: Record<string, any>;
+      outputs?: Record<string, any>;
+      referenceOutputs?: Record<string, any>;
+    }) => Promise<EvaluationResult | EvaluationResults>);
 
 /**
  * Wraps an evaluator function + implements the RunEvaluator interface.
@@ -110,7 +124,18 @@ export class DynamicRunEvaluator<Func extends (...args: any[]) => any>
       langSmithRunAndExample: { run: Run; example: Example };
     }) => {
       const { run, example } = input.langSmithRunAndExample;
-      return evaluator(run, example);
+
+      return evaluator(
+        {
+          ...run,
+          run,
+          example,
+          inputs: example?.inputs,
+          outputs: run?.outputs,
+          referenceOutputs: example?.outputs,
+        },
+        example
+      );
     }) as Func;
   }
diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts
index 1d62dd6d9..73de7246f 100644
--- a/js/src/tests/evaluate.int.test.ts
+++ b/js/src/tests/evaluate.int.test.ts
@@ -770,3 +770,346 @@ test("evaluate accepts evaluators which return multiple feedback keys", async (
     { key: "second-key", score: 2, comment },
   ]);
 });
+
+test("evaluate can handle evaluators with object parameters", async () => {
+  const targetFunc = (input: Record<string, any>) => {
+    return {
+      foo: input.input + 1,
+    };
+  };
+
+  const objectEvaluator = ({
+    inputs,
+    outputs,
+    referenceOutputs,
+  }: {
+    inputs?: Record<string, any>;
+    outputs?: Record<string, any>;
+    referenceOutputs?: Record<string, any>;
+  }) => {
+    return {
+      key: "object_evaluator",
+      score: outputs?.foo === referenceOutputs?.output ? 1 : 0,
+      comment: `Input: ${inputs?.input}, Output: ${outputs?.foo}, Expected: ${referenceOutputs?.output}`,
+    };
+  };
+
+  const evalRes = await evaluate(targetFunc, {
+    data: TESTING_DATASET_NAME,
+    evaluators: [objectEvaluator],
+    description: "evaluate can handle evaluators with object parameters",
+  });
+
+  expect(evalRes.results).toHaveLength(2);
+
+  // Check first result
+  const firstResult = evalRes.results[0];
+  expect(firstResult.evaluationResults.results).toHaveLength(1);
+  const firstEval = firstResult.evaluationResults.results[0];
+  expect(firstEval.key).toBe("object_evaluator");
+  expect(firstEval.score).toBeDefined();
+  expect(firstEval.comment).toContain("Input:");
+  expect(firstEval.comment).toContain("Output:");
+  expect(firstEval.comment).toContain("Expected:");
+
+  // Check second result
+  const secondResult = evalRes.results[1];
+  expect(secondResult.evaluationResults.results).toHaveLength(1);
+  const secondEval = secondResult.evaluationResults.results[0];
+  expect(secondEval.key).toBe("object_evaluator");
+  expect(secondEval.score).toBeDefined();
+  expect(secondEval.comment).toContain("Input:");
+  expect(secondEval.comment).toContain("Output:");
+  expect(secondEval.comment).toContain("Expected:");
+});
+
+test("evaluate can mix evaluators with different parameter styles", async () => {
+  const targetFunc = (input: Record<string, any>) => {
+    return {
+      foo: input.input + 1,
+    };
+  };
+
+  // Traditional style evaluator
+  const traditionalEvaluator = (run: Run, example?: Example) => {
+    return {
+      key: "traditional",
+      score: run.outputs?.foo === example?.outputs?.output ? 1 : 0,
+    };
+  };
+
+  // Object style evaluator
+  const objectEvaluator = ({
+    outputs,
+    referenceOutputs,
+  }: {
+    outputs?: Record<string, any>;
+    referenceOutputs?: Record<string, any>;
+  }) => {
+    return {
+      key: "object_style",
+      score: outputs?.foo === referenceOutputs?.output ? 1 : 0,
+    };
+  };
+
+  const evalRes = await evaluate(targetFunc, {
+    data: TESTING_DATASET_NAME,
+    evaluators: [traditionalEvaluator, objectEvaluator],
+    description: "evaluate can mix evaluators with different parameter styles",
+  });
+
+  expect(evalRes.results).toHaveLength(2);
+
+  // Check both evaluators ran for each example
+  for (const result of evalRes.results) {
+    expect(result.evaluationResults.results).toHaveLength(2);
+
+    const traditionalResult = result.evaluationResults.results.find(
+      (r) => r.key === "traditional"
+    );
+    expect(traditionalResult).toBeDefined();
+    expect(typeof traditionalResult?.score).toBe("number");
+
+    const objectResult = result.evaluationResults.results.find(
+      (r) => r.key === "object_style"
+    );
+    expect(objectResult).toBeDefined();
+    expect(typeof objectResult?.score).toBe("number");
+  }
+});
+
+test("evaluate handles partial object parameters correctly", async () => {
+  const targetFunc = (input: Record<string, any>) => {
+    return {
+      foo: input.input + 1,
+    };
+  };
+
+  // Evaluator that only uses outputs and referenceOutputs
+  const outputOnlyEvaluator = ({
+    outputs,
+    referenceOutputs,
+  }: {
+    outputs?: Record<string, any>;
+    referenceOutputs?: Record<string, any>;
+  }) => {
+    return {
+      key: "output_only",
+      score: outputs?.foo === referenceOutputs?.output ? 1 : 0,
+    };
+  };
+
+  // Evaluator that only uses run and example
+  const runOnlyEvaluator = ({
+    run,
+    example,
+  }: {
+    run?: Run;
+    example?: Example;
+  }) => {
+    return {
+      key: "run_only",
+      score: run?.outputs?.foo === example?.outputs?.output ? 1 : 0,
+    };
+  };
+
+  const evalRes = await evaluate(targetFunc, {
+    data: TESTING_DATASET_NAME,
+    evaluators: [outputOnlyEvaluator, runOnlyEvaluator],
+    description: "evaluate handles partial object parameters correctly",
+  });
+
+  expect(evalRes.results).toHaveLength(2);
+
+  // Check both evaluators ran for each example
+  for (const result of evalRes.results) {
+    expect(result.evaluationResults.results).toHaveLength(2);
+
+    const outputResult = result.evaluationResults.results.find(
+      (r) => r.key === "output_only"
+    );
+    expect(outputResult).toBeDefined();
+    expect(typeof outputResult?.score).toBe("number");
+
+    const runResult = result.evaluationResults.results.find(
+      (r) => r.key === "run_only"
+    );
+    expect(runResult).toBeDefined();
+    expect(typeof runResult?.score).toBe("number");
+  }
+});
+
+test("evaluate handles async object-style evaluators", async () => {
+  const targetFunc = (input: Record<string, any>) => {
+    return {
+      foo: input.input + 1,
+    };
+  };
+
+  const asyncEvaluator = async ({
+    outputs,
+    referenceOutputs,
+  }: {
+    outputs?: Record<string, any>;
+    referenceOutputs?: Record<string, any>;
+  }) => {
+    // Simulate async operation
+    await new Promise((resolve) => setTimeout(resolve, 10));
+    return {
+      key: "async_evaluator",
+      score: outputs?.foo === referenceOutputs?.output ? 1 : 0,
+    };
+  };
+
+  const evalRes = await evaluate(targetFunc, {
+    data: TESTING_DATASET_NAME,
+    evaluators: [asyncEvaluator],
+    description: "evaluate handles async object-style evaluators",
+  });
+
+  expect(evalRes.results).toHaveLength(2);
+
+  for (const result of evalRes.results) {
+    expect(result.evaluationResults.results).toHaveLength(1);
+    const evalResult = result.evaluationResults.results[0];
+    expect(evalResult.key).toBe("async_evaluator");
+    expect(typeof evalResult.score).toBe("number");
+  }
+});
+
+test("evaluate can evaluate with updated summary evaluators", async () => {
+  const targetFunc = (input: Record<string, any>) => {
+    return {
+      foo: input.input + 1,
+    };
+  };
+
+  const customSummaryEvaluator = ({
+    runs,
+    examples,
+    inputs,
+    outputs,
+    referenceOutputs,
+  }: {
+    runs?: Run[];
+    examples?: Example[];
+    inputs?: Record<string, any>[];
+    outputs?: Record<string, any>[];
+    referenceOutputs?: Record<string, any>[];
+  }): Promise<EvaluationResult> => {
+    const runIds = runs?.map(({ id }) => id).join(", ") || "";
+    const exampleIds = examples?.map(({ id }) => id).join(", ");
+    const inputValues = inputs?.map((input) => input.input).join(", ");
+    const outputValues = outputs?.map((output) => output.foo).join(", ");
+    const referenceOutputValues = referenceOutputs
+      ?.map((ref) => ref.output)
+      .join(", ");
+
+    return Promise.resolve({
+      key: "UpdatedSummaryEvaluator",
+      score: 1,
+      comment: `Runs: ${runIds} Examples: ${exampleIds} Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues}`,
+    });
+  };
+
+  const evalRes = await evaluate(targetFunc, {
+    data: TESTING_DATASET_NAME,
+    summaryEvaluators: [customSummaryEvaluator],
+    description: "evaluate can evaluate with updated summary evaluators",
+  });
+
+  expect(evalRes.summaryResults.results).toHaveLength(1);
+  expect(evalRes.summaryResults.results[0].key).toBe("UpdatedSummaryEvaluator");
+  expect(evalRes.summaryResults.results[0].score).toBe(1);
+
+  const allRuns = evalRes.results.map(({ run }) => run);
+  const allExamples = evalRes.results.map(({ example }) => example);
+  const allInputs = evalRes.results.map(({ example }) => example.inputs);
+  const allOutputs = evalRes.results.map(({ run }) => run.outputs);
+  const allReferenceOutputs = evalRes.results.map(
+    ({ example }) => example.outputs
+  );
+
+  const runIds = allRuns.map(({ id }) => id).join(", ");
+  const exampleIds = allExamples.map(({ id }) => id).join(", ");
+  const inputValues = allInputs.map((input) => input.input).join(", ");
+  const outputValues = allOutputs.map((output) => output?.foo).join(", ");
+  const referenceOutputValues = allReferenceOutputs
+    .map((ref) => ref?.output)
+    .join(", ");
+
+  expect(evalRes.summaryResults.results[0].comment).toBe(
+    `Runs: ${runIds} Examples: ${exampleIds} Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues}`
+  );
+});
+
+test("evaluate handles partial summary evaluator parameters correctly", async () => {
+  const targetFunc = (input: Record<string, any>) => {
+    return {
+      foo: input.input + 1,
+    };
+  };
+
+  // Summary evaluator that only uses inputs, outputs, and referenceOutputs
+  const outputOnlySummaryEvaluator = ({
+    inputs,
+    outputs,
+    referenceOutputs,
+  }: {
+    inputs?: Record<string, any>[];
+    outputs?: Record<string, any>[];
+    referenceOutputs?: Record<string, any>[];
+  }): Promise<EvaluationResult> => {
+    const inputValues = inputs?.map((input) => input.input).join(", ") || "";
+    const outputValues = outputs?.map((output) => output.foo).join(", ") || "";
+    const referenceOutputValues = referenceOutputs
+      ?.map((ref) => ref?.output)
+      .join(", ");
+
+    // Calculate average difference between outputs and reference outputs
+    const avgDiff =
+      (outputs?.reduce((sum, output, i) => {
+        return sum + Math.abs(output?.foo - referenceOutputs?.[i]?.output);
+      }, 0) || 0) / (outputs?.length || 1);
+
+    return Promise.resolve({
+      key: "OutputOnlySummaryEvaluator",
+      score: avgDiff === 0 ? 1 : 0,
+      comment: `Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues} AvgDiff: ${avgDiff}`,
+    });
+  };
+
+  const evalRes = await evaluate(targetFunc, {
+    data: TESTING_DATASET_NAME,
+    summaryEvaluators: [outputOnlySummaryEvaluator],
+    description: "evaluate handles partial summary evaluator parameters",
+  });
+
+  expect(evalRes.summaryResults.results).toHaveLength(1);
+  const summaryResult = evalRes.summaryResults.results[0];
+  expect(summaryResult.key).toBe("OutputOnlySummaryEvaluator");
+  expect(typeof summaryResult.score).toBe("number");
+
+  // Verify the comment contains all the expected parts
+  const allInputs = evalRes.results.map(({ example }) => example.inputs);
+  const allOutputs = evalRes.results.map(({ run }) => run.outputs);
+  const allReferenceOutputs = evalRes.results.map(
+    ({ example }) => example.outputs
+  );
+
+  const inputValues = allInputs.map((input) => input.input).join(", ");
+  const outputValues = allOutputs.map((output) => output?.foo).join(", ");
+  const referenceOutputValues = allReferenceOutputs
+    .map((ref) => ref?.output)
+    .join(", ");
+
+  // Calculate expected average difference
+  const expectedAvgDiff =
+    allOutputs.reduce((sum, output, i) => {
+      return sum + Math.abs(output?.foo - allReferenceOutputs[i]?.output);
+    }, 0) / allOutputs.length;
+
+  expect(summaryResult.comment).toBe(
+    `Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues} AvgDiff: ${expectedAvgDiff}`
+  );
+});
diff --git a/js/src/tests/evaluate_comparative.int.test.ts b/js/src/tests/evaluate_comparative.int.test.ts
index 81c14d653..3f8520b84 100644
--- a/js/src/tests/evaluate_comparative.int.test.ts
+++ b/js/src/tests/evaluate_comparative.int.test.ts
@@ -1,6 +1,10 @@
 import { evaluate } from "../evaluation/_runner.js";
-import { evaluateComparative } from "../evaluation/evaluate_comparative.js";
+import {
+  evaluateComparative,
+  _ComparativeEvaluator,
+} from "../evaluation/evaluate_comparative.js";
"../evaluation/evaluate_comparative.js"; import { Client } from "../index.js"; +import { Run } from "../schemas.js"; import { waitUntilRunFound } from "./utils.js"; import { v4 as uuidv4 } from "uuid"; @@ -51,9 +55,11 @@ describe("evaluate comparative", () => { [firstEval.experimentName, secondEval.experimentName], { evaluators: [ - (runs) => ({ + ({ runs }: { runs?: Run[] }) => ({ key: "latter_precedence", - scores: Object.fromEntries(runs.map((run, i) => [run.id, i % 2])), + scores: Object.fromEntries( + runs?.map((run, i) => [run.id, i % 2]) ?? [] + ), }), ], } @@ -74,9 +80,11 @@ describe("evaluate comparative", () => { ], { evaluators: [ - (runs) => ({ + ({ runs }: { runs?: Run[] }) => ({ key: "latter_precedence", - scores: Object.fromEntries(runs.map((run, i) => [run.id, i % 2])), + scores: Object.fromEntries( + runs?.map((run, i) => [run.id, i % 2]) ?? [] + ), }), ], } @@ -84,4 +92,132 @@ describe("evaluate comparative", () => { expect(pairwise.results.length).toEqual(2); }); + + describe("evaluator formats", () => { + test("old format evaluator", async () => { + const pairwise = await evaluateComparative( + [ + evaluate((input) => ({ foo: `first:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + evaluate((input) => ({ foo: `second:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + ], + { + evaluators: [ + // Old format evaluator + (runs, example) => ({ + key: "old_format", + scores: Object.fromEntries( + runs.map((run) => [ + run.id, + run.outputs?.foo === `second:${example.inputs.input}` ? 1 : 0, + ]) + ), + }), + ], + } + ); + + expect(pairwise.results.length).toEqual(2); + expect(pairwise.results[0].key).toBe("old_format"); + // Second run in each pair should have score of 1 + expect(Object.values(pairwise.results[0].scores)).toEqual([0, 1]); + }); + + test("new format evaluator", async () => { + const matchesSecondEvaluator: _ComparativeEvaluator = ({ + runs, + inputs, + outputs, + }: { + runs?: Run[]; + inputs?: Record; + outputs?: Record[]; + }) => ({ + key: "new_format", + scores: Object.fromEntries( + // Add null checks for the optional parameters + runs?.map((run, i) => [ + run.id, + outputs?.[i]?.foo === `second:${inputs?.input}` ? 1 : 0, + ]) ?? [] + ), + }); + + const pairwise = await evaluateComparative( + [ + evaluate((input) => ({ foo: `first:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + evaluate((input) => ({ foo: `second:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + ], + { + evaluators: [matchesSecondEvaluator], + } + ); + + expect(pairwise.results.length).toEqual(2); + expect(pairwise.results[0].key).toBe("new_format"); + // Second run in each pair should have score of 1 + expect(Object.values(pairwise.results[0].scores)).toEqual([0, 1]); + }); + + test("mixed old and new format evaluators", async () => { + const matchesSecondEvaluator: _ComparativeEvaluator = ({ + runs, + inputs, + outputs, + }: { + runs?: Run[]; + inputs?: Record; + outputs?: Record[]; + }) => ({ + key: "new_format", + scores: Object.fromEntries( + runs?.map((run, i) => [ + run.id, + outputs?.[i]?.foo === `second:${inputs?.input}` ? 1 : 0, + ]) ?? 
[] + ), + }); + const pairwise = await evaluateComparative( + [ + evaluate((input) => ({ foo: `first:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + evaluate((input) => ({ foo: `second:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + ], + { + evaluators: [ + // Old format + (runs, example) => ({ + key: "old_format", + scores: Object.fromEntries( + runs.map((run) => [ + run.id, + run.outputs?.foo === `second:${example.inputs.input}` ? 1 : 0, + ]) + ), + }), + // New format + matchesSecondEvaluator, + ], + } + ); + + expect(pairwise.results.length).toEqual(4); // 2 examples × 2 evaluators + expect(pairwise.results.map((r) => r.key)).toContain("old_format"); + expect(pairwise.results.map((r) => r.key)).toContain("new_format"); + // Each evaluator should score the second run as 1 + pairwise.results.forEach((result) => { + expect(Object.values(result.scores)).toEqual([0, 1]); + }); + }); + }); });