From d076dce33c2ee2fea9d06418e083e2406ffc665c Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 26 Nov 2024 20:47:59 -0800 Subject: [PATCH 01/13] js[patch]: simple evaluator args --- js/src/evaluation/_runner.ts | 19 ++- js/src/evaluation/evaluator.ts | 32 +++++ js/src/tests/evaluate.int.test.ts | 206 ++++++++++++++++++++++++++++++ 3 files changed, 253 insertions(+), 4 deletions(-) diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts index 93896aaf1..19a1a1e86 100644 --- a/js/src/evaluation/_runner.ts +++ b/js/src/evaluation/_runner.ts @@ -43,10 +43,21 @@ export type SummaryEvaluatorT = export type EvaluatorT = | RunEvaluator | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) - | (( - run: Run, - example?: Example - ) => Promise); + | ((run: Run, example?: Example) => Promise) + | ((args: { + run?: Run; + example?: Example; + inputs?: Record; + outputs?: Record; + referenceOutputs?: Record; + }) => EvaluationResult | EvaluationResults) + | ((args: { + run?: Run; + example?: Example; + inputs?: Record; + outputs?: Record; + referenceOutputs?: Record; + }) => Promise); interface _ForwardResults { run: Run; diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts index 92777082b..59bc636df 100644 --- a/js/src/evaluation/evaluator.ts +++ b/js/src/evaluation/evaluator.ts @@ -110,6 +110,21 @@ export class DynamicRunEvaluator any> langSmithRunAndExample: { run: Run; example: Example }; }) => { const { run, example } = input.langSmithRunAndExample; + + // Check if the evaluator expects the new argument format + const params = getEvaluatorParameters(evaluator); + + if (params.type === "object") { + return evaluator({ + run, + example, + inputs: example?.inputs, + outputs: run?.outputs, + referenceOutputs: example?.outputs + }); + } + + // Fallback to original behavior for backward compatibility return evaluator(run, example); }) as Func; } @@ -225,3 +240,20 @@ export class DynamicRunEvaluator any> export function runEvaluator(func: RunEvaluatorLike): RunEvaluator { return new DynamicRunEvaluator(func); } + +function getEvaluatorParameters(func: Function): { type: "tuple" | "object" } { + const funcStr = func.toString(); + + // Check if the function accepts a single object parameter + if (funcStr.match(/^\s*(?:async\s+)?(?:function\s*)?\(?\s*{\s*[a-zA-Z_]/)) { + return { type: "object" }; + } + + // Check if the function accepts run and example parameters + if (funcStr.match(/^\s*(?:async\s+)?(?:function\s*)?\(?\s*run\s*,\s*example/)) { + return { type: "tuple" }; + } + + // Default to object type for any other case + return { type: "object" }; +} diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index 1d62dd6d9..1ba159704 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -770,3 +770,209 @@ test("evaluate accepts evaluators which return multiple feedback keys", async () { key: "second-key", score: 2, comment }, ]); }); + +test("evaluate can handle evaluators with object parameters", async () => { + const targetFunc = (input: Record) => { + return { + foo: input.input + 1, + }; + }; + + const objectEvaluator = ({ + inputs, + outputs, + referenceOutputs, + }: { + inputs?: Record; + outputs?: Record; + referenceOutputs?: Record; + }) => { + return { + key: "object_evaluator", + score: outputs?.foo === referenceOutputs?.output ? 1 : 0, + comment: `Input: ${inputs?.input}, Output: ${outputs?.foo}, Expected: ${referenceOutputs?.output}`, + }; + }; + + const evalRes = await evaluate(targetFunc, { + data: TESTING_DATASET_NAME, + evaluators: [objectEvaluator], + description: "evaluate can handle evaluators with object parameters", + }); + + expect(evalRes.results).toHaveLength(2); + + // Check first result + const firstResult = evalRes.results[0]; + expect(firstResult.evaluationResults.results).toHaveLength(1); + const firstEval = firstResult.evaluationResults.results[0]; + expect(firstEval.key).toBe("object_evaluator"); + expect(firstEval.score).toBeDefined(); + expect(firstEval.comment).toContain("Input:"); + expect(firstEval.comment).toContain("Output:"); + expect(firstEval.comment).toContain("Expected:"); + + // Check second result + const secondResult = evalRes.results[1]; + expect(secondResult.evaluationResults.results).toHaveLength(1); + const secondEval = secondResult.evaluationResults.results[0]; + expect(secondEval.key).toBe("object_evaluator"); + expect(secondEval.score).toBeDefined(); + expect(secondEval.comment).toContain("Input:"); + expect(secondEval.comment).toContain("Output:"); + expect(secondEval.comment).toContain("Expected:"); +}); + +test("evaluate can mix evaluators with different parameter styles", async () => { + const targetFunc = (input: Record) => { + return { + foo: input.input + 1, + }; + }; + + // Traditional style evaluator + const traditionalEvaluator = (run: Run, example?: Example) => { + return { + key: "traditional", + score: run.outputs?.foo === example?.outputs?.output ? 1 : 0, + }; + }; + + // Object style evaluator + const objectEvaluator = ({ + outputs, + referenceOutputs, + }: { + outputs?: Record; + referenceOutputs?: Record; + }) => { + return { + key: "object_style", + score: outputs?.foo === referenceOutputs?.output ? 1 : 0, + }; + }; + + const evalRes = await evaluate(targetFunc, { + data: TESTING_DATASET_NAME, + evaluators: [traditionalEvaluator, objectEvaluator], + description: "evaluate can mix evaluators with different parameter styles", + }); + + expect(evalRes.results).toHaveLength(2); + + // Check both evaluators ran for each example + for (const result of evalRes.results) { + expect(result.evaluationResults.results).toHaveLength(2); + + const traditionalResult = result.evaluationResults.results.find( + (r) => r.key === "traditional" + ); + expect(traditionalResult).toBeDefined(); + expect(typeof traditionalResult?.score).toBe("number"); + + const objectResult = result.evaluationResults.results.find( + (r) => r.key === "object_style" + ); + expect(objectResult).toBeDefined(); + expect(typeof objectResult?.score).toBe("number"); + } +}); + +test("evaluate handles partial object parameters correctly", async () => { + const targetFunc = (input: Record) => { + return { + foo: input.input + 1, + }; + }; + + // Evaluator that only uses outputs and referenceOutputs + const outputOnlyEvaluator = ({ + outputs, + referenceOutputs, + }: { + outputs?: Record; + referenceOutputs?: Record; + }) => { + return { + key: "output_only", + score: outputs?.foo === referenceOutputs?.output ? 1 : 0, + }; + }; + + // Evaluator that only uses run and example + const runOnlyEvaluator = ({ + run, + example, + }: { + run?: Run; + example?: Example; + }) => { + return { + key: "run_only", + score: run?.outputs?.foo === example?.outputs?.output ? 1 : 0, + }; + }; + + const evalRes = await evaluate(targetFunc, { + data: TESTING_DATASET_NAME, + evaluators: [outputOnlyEvaluator, runOnlyEvaluator], + description: "evaluate handles partial object parameters correctly", + }); + + expect(evalRes.results).toHaveLength(2); + + // Check both evaluators ran for each example + for (const result of evalRes.results) { + expect(result.evaluationResults.results).toHaveLength(2); + + const outputResult = result.evaluationResults.results.find( + (r) => r.key === "output_only" + ); + expect(outputResult).toBeDefined(); + expect(typeof outputResult?.score).toBe("number"); + + const runResult = result.evaluationResults.results.find( + (r) => r.key === "run_only" + ); + expect(runResult).toBeDefined(); + expect(typeof runResult?.score).toBe("number"); + } +}); + +test("evaluate handles async object-style evaluators", async () => { + const targetFunc = (input: Record) => { + return { + foo: input.input + 1, + }; + }; + + const asyncEvaluator = async ({ + outputs, + referenceOutputs, + }: { + outputs?: Record; + referenceOutputs?: Record; + }) => { + // Simulate async operation + await new Promise((resolve) => setTimeout(resolve, 10)); + return { + key: "async_evaluator", + score: outputs?.foo === referenceOutputs?.output ? 1 : 0, + }; + }; + + const evalRes = await evaluate(targetFunc, { + data: TESTING_DATASET_NAME, + evaluators: [asyncEvaluator], + description: "evaluate handles async object-style evaluators", + }); + + expect(evalRes.results).toHaveLength(2); + + for (const result of evalRes.results) { + expect(result.evaluationResults.results).toHaveLength(1); + const evalResult = result.evaluationResults.results[0]; + expect(evalResult.key).toBe("async_evaluator"); + expect(typeof evalResult.score).toBe("number"); + } +}); From d8cbab7c64381f84108cc9134bdeeb55626c9b39 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 26 Nov 2024 20:49:29 -0800 Subject: [PATCH 02/13] fmt --- js/src/evaluation/_runner.ts | 5 ++++- js/src/evaluation/evaluator.ts | 18 ++++++++++-------- js/src/tests/evaluate.int.test.ts | 4 ++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts index 19a1a1e86..6a4be888e 100644 --- a/js/src/evaluation/_runner.ts +++ b/js/src/evaluation/_runner.ts @@ -43,7 +43,10 @@ export type SummaryEvaluatorT = export type EvaluatorT = | RunEvaluator | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) - | ((run: Run, example?: Example) => Promise) + | (( + run: Run, + example?: Example + ) => Promise) | ((args: { run?: Run; example?: Example; diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts index 59bc636df..c26c93536 100644 --- a/js/src/evaluation/evaluator.ts +++ b/js/src/evaluation/evaluator.ts @@ -110,20 +110,20 @@ export class DynamicRunEvaluator any> langSmithRunAndExample: { run: Run; example: Example }; }) => { const { run, example } = input.langSmithRunAndExample; - + // Check if the evaluator expects the new argument format const params = getEvaluatorParameters(evaluator); - + if (params.type === "object") { return evaluator({ run, example, inputs: example?.inputs, outputs: run?.outputs, - referenceOutputs: example?.outputs + referenceOutputs: example?.outputs, }); } - + // Fallback to original behavior for backward compatibility return evaluator(run, example); }) as Func; @@ -243,17 +243,19 @@ export function runEvaluator(func: RunEvaluatorLike): RunEvaluator { function getEvaluatorParameters(func: Function): { type: "tuple" | "object" } { const funcStr = func.toString(); - + // Check if the function accepts a single object parameter if (funcStr.match(/^\s*(?:async\s+)?(?:function\s*)?\(?\s*{\s*[a-zA-Z_]/)) { return { type: "object" }; } - + // Check if the function accepts run and example parameters - if (funcStr.match(/^\s*(?:async\s+)?(?:function\s*)?\(?\s*run\s*,\s*example/)) { + if ( + funcStr.match(/^\s*(?:async\s+)?(?:function\s*)?\(?\s*run\s*,\s*example/) + ) { return { type: "tuple" }; } - + // Default to object type for any other case return { type: "object" }; } diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index 1ba159704..75f1f76ca 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -863,7 +863,7 @@ test("evaluate can mix evaluators with different parameter styles", async () => // Check both evaluators ran for each example for (const result of evalRes.results) { expect(result.evaluationResults.results).toHaveLength(2); - + const traditionalResult = result.evaluationResults.results.find( (r) => r.key === "traditional" ); @@ -924,7 +924,7 @@ test("evaluate handles partial object parameters correctly", async () => { // Check both evaluators ran for each example for (const result of evalRes.results) { expect(result.evaluationResults.results).toHaveLength(2); - + const outputResult = result.evaluationResults.results.find( (r) => r.key === "output_only" ); From 0335961d592298d98625624db171bb22ec827435 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Tue, 26 Nov 2024 20:51:45 -0800 Subject: [PATCH 03/13] lint --- js/src/evaluation/evaluator.ts | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts index c26c93536..aa00421b3 100644 --- a/js/src/evaluation/evaluator.ts +++ b/js/src/evaluation/evaluator.ts @@ -241,7 +241,30 @@ export function runEvaluator(func: RunEvaluatorLike): RunEvaluator { return new DynamicRunEvaluator(func); } -function getEvaluatorParameters(func: Function): { type: "tuple" | "object" } { +// Define the object parameter type +type EvaluatorObjectParams = { + run?: Run; + example?: Example; + inputs?: Record; + outputs?: Record; + referenceOutputs?: Record; +}; + +// Define all possible evaluator function signatures +type EvaluatorFunction = + | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) + | (( + run: Run, + example?: Example + ) => Promise) + | ((params: EvaluatorObjectParams) => EvaluationResult | EvaluationResults) + | (( + params: EvaluatorObjectParams + ) => Promise); + +function getEvaluatorParameters(func: EvaluatorFunction): { + type: "tuple" | "object"; +} { const funcStr = func.toString(); // Check if the function accepts a single object parameter From 637fd6ecc5b911088384af1bece694ef4a372bb9 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 08:33:42 -0800 Subject: [PATCH 04/13] cr --- js/src/evaluation/evaluator.ts | 58 ++++------------------------------ 1 file changed, 6 insertions(+), 52 deletions(-) diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts index aa00421b3..54b305f12 100644 --- a/js/src/evaluation/evaluator.ts +++ b/js/src/evaluation/evaluator.ts @@ -111,21 +111,17 @@ export class DynamicRunEvaluator any> }) => { const { run, example } = input.langSmithRunAndExample; - // Check if the evaluator expects the new argument format - const params = getEvaluatorParameters(evaluator); - - if (params.type === "object") { - return evaluator({ + return evaluator( + { + ...run, run, example, inputs: example?.inputs, outputs: run?.outputs, referenceOutputs: example?.outputs, - }); - } - - // Fallback to original behavior for backward compatibility - return evaluator(run, example); + }, + example + ); }) as Func; } @@ -240,45 +236,3 @@ export class DynamicRunEvaluator any> export function runEvaluator(func: RunEvaluatorLike): RunEvaluator { return new DynamicRunEvaluator(func); } - -// Define the object parameter type -type EvaluatorObjectParams = { - run?: Run; - example?: Example; - inputs?: Record; - outputs?: Record; - referenceOutputs?: Record; -}; - -// Define all possible evaluator function signatures -type EvaluatorFunction = - | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) - | (( - run: Run, - example?: Example - ) => Promise) - | ((params: EvaluatorObjectParams) => EvaluationResult | EvaluationResults) - | (( - params: EvaluatorObjectParams - ) => Promise); - -function getEvaluatorParameters(func: EvaluatorFunction): { - type: "tuple" | "object"; -} { - const funcStr = func.toString(); - - // Check if the function accepts a single object parameter - if (funcStr.match(/^\s*(?:async\s+)?(?:function\s*)?\(?\s*{\s*[a-zA-Z_]/)) { - return { type: "object" }; - } - - // Check if the function accepts run and example parameters - if ( - funcStr.match(/^\s*(?:async\s+)?(?:function\s*)?\(?\s*run\s*,\s*example/) - ) { - return { type: "tuple" }; - } - - // Default to object type for any other case - return { type: "object" }; -} From 355ced58c7546abe34be17146da32cbec124d504 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 09:23:36 -0800 Subject: [PATCH 05/13] cr --- js/src/evaluation/evaluator.ts | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/js/src/evaluation/evaluator.ts b/js/src/evaluation/evaluator.ts index 54b305f12..cad4707f1 100644 --- a/js/src/evaluation/evaluator.ts +++ b/js/src/evaluation/evaluator.ts @@ -95,7 +95,21 @@ export type RunEvaluatorLike = run: Run, example?: Example ) => Promise) - | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults); + | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) + | ((args: { + run?: Run; + example?: Example; + inputs?: Record; + outputs?: Record; + referenceOutputs?: Record; + }) => EvaluationResult | EvaluationResults) + | ((args: { + run?: Run; + example?: Example; + inputs?: Record; + outputs?: Record; + referenceOutputs?: Record; + }) => Promise); /** * Wraps an evaluator function + implements the RunEvaluator interface. From f9d8632951779427be02957849ec95ca00340f4b Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 11:02:18 -0800 Subject: [PATCH 06/13] summary --- js/src/evaluation/_runner.ts | 29 ++++++- js/src/tests/evaluate.int.test.ts | 123 ++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+), 2 deletions(-) diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts index 6a4be888e..4507a81dc 100644 --- a/js/src/evaluation/_runner.ts +++ b/js/src/evaluation/_runner.ts @@ -37,7 +37,21 @@ export type SummaryEvaluatorT = | (( runs: Array, examples: Array - ) => EvaluationResult | EvaluationResults); + ) => EvaluationResult | EvaluationResults) + | ((args: { + runs?: Array; + examples?: Array; + inputs?: Array>; + outputs?: Array>; + referenceOutputs?: Array>; + }) => EvaluationResult | EvaluationResults) + | ((args: { + runs?: Array; + example?: Example; + inputs?: Record; + outputs?: Record; + referenceOutputs?: Record; + }) => Promise); // Row-level evaluator export type EvaluatorT = @@ -671,7 +685,18 @@ export class _ExperimentManager { for (const evaluator of wrappedEvaluators) { try { - const summaryEvalResult = await evaluator(runsArray, examples); + const inputs = examples.map((example) => example.inputs); + const outputs = runsArray.map((run) => run.outputs); + const referenceOutputs = examples.map((example) => example.outputs); + + const summaryEvalResult = await evaluator({ + runs: runsArray, + examples, + inputs, + outputs, + referenceOutputs, + }); + const flattenedResults = this.client._selectEvalResults(summaryEvalResult); aggregateFeedback.push(...flattenedResults); diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index 75f1f76ca..023de5706 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -976,3 +976,126 @@ test("evaluate handles async object-style evaluators", async () => { expect(typeof evalResult.score).toBe("number"); } }); + +test("evaluate can evaluate with updated summary evaluators", async () => { + const targetFunc = (input: Record) => { + return { + foo: input.input + 1, + }; + }; + + const customSummaryEvaluator = ({ + runs, + examples, + inputs, + outputs, + referenceOutputs, + }: { + runs: Run[]; + examples: Example[]; + inputs: Record[]; + outputs: Record[]; + referenceOutputs: Record[]; + }): Promise => { + const runIds = runs.map(({ id }) => id).join(", "); + const exampleIds = examples.map(({ id }) => id).join(", "); + const inputValues = inputs.map(input => input.input).join(", "); + const outputValues = outputs.map(output => output.foo).join(", "); + const referenceOutputValues = referenceOutputs.map(ref => ref.output).join(", "); + + return Promise.resolve({ + key: "UpdatedSummaryEvaluator", + score: 1, + comment: `Runs: ${runIds} Examples: ${exampleIds} Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues}`, + }); + }; + + const evalRes = await evaluate(targetFunc, { + data: TESTING_DATASET_NAME, + summaryEvaluators: [customSummaryEvaluator], + description: "evaluate can evaluate with updated summary evaluators", + }); + + expect(evalRes.summaryResults.results).toHaveLength(1); + expect(evalRes.summaryResults.results[0].key).toBe("UpdatedSummaryEvaluator"); + expect(evalRes.summaryResults.results[0].score).toBe(1); + + const allRuns = evalRes.results.map(({ run }) => run); + const allExamples = evalRes.results.map(({ example }) => example); + const allInputs = evalRes.results.map(({ example }) => example.inputs); + const allOutputs = evalRes.results.map(({ run }) => run.outputs); + const allReferenceOutputs = evalRes.results.map(({ example }) => example.outputs); + + const runIds = allRuns.map(({ id }) => id).join(", "); + const exampleIds = allExamples.map(({ id }) => id).join(", "); + const inputValues = allInputs.map(input => input.input).join(", "); + const outputValues = allOutputs.map(output => output.foo).join(", "); + const referenceOutputValues = allReferenceOutputs.map(ref => ref.output).join(", "); + + expect(evalRes.summaryResults.results[0].comment).toBe( + `Runs: ${runIds} Examples: ${exampleIds} Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues}` + ); +}); + +test("evaluate handles partial summary evaluator parameters correctly", async () => { + const targetFunc = (input: Record) => { + return { + foo: input.input + 1, + }; + }; + + // Summary evaluator that only uses inputs, outputs, and referenceOutputs + const outputOnlySummaryEvaluator = ({ + inputs, + outputs, + referenceOutputs, + }: { + inputs: Record[]; + outputs: Record[]; + referenceOutputs: Record[]; + }): Promise => { + const inputValues = inputs.map(input => input.input).join(", "); + const outputValues = outputs.map(output => output.foo).join(", "); + const referenceOutputValues = referenceOutputs.map(ref => ref.output).join(", "); + + // Calculate average difference between outputs and reference outputs + const avgDiff = outputs.reduce((sum, output, i) => { + return sum + Math.abs(output.foo - referenceOutputs[i].output); + }, 0) / outputs.length; + + return Promise.resolve({ + key: "OutputOnlySummaryEvaluator", + score: avgDiff === 0 ? 1 : 0, + comment: `Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues} AvgDiff: ${avgDiff}`, + }); + }; + + const evalRes = await evaluate(targetFunc, { + data: TESTING_DATASET_NAME, + summaryEvaluators: [outputOnlySummaryEvaluator], + description: "evaluate handles partial summary evaluator parameters", + }); + + expect(evalRes.summaryResults.results).toHaveLength(1); + const summaryResult = evalRes.summaryResults.results[0]; + expect(summaryResult.key).toBe("OutputOnlySummaryEvaluator"); + expect(typeof summaryResult.score).toBe("number"); + + // Verify the comment contains all the expected parts + const allInputs = evalRes.results.map(({ example }) => example.inputs); + const allOutputs = evalRes.results.map(({ run }) => run.outputs); + const allReferenceOutputs = evalRes.results.map(({ example }) => example.outputs); + + const inputValues = allInputs.map(input => input.input).join(", "); + const outputValues = allOutputs.map(output => output.foo).join(", "); + const referenceOutputValues = allReferenceOutputs.map(ref => ref.output).join(", "); + + // Calculate expected average difference + const expectedAvgDiff = allOutputs.reduce((sum, output, i) => { + return sum + Math.abs(output.foo - allReferenceOutputs[i].output); + }, 0) / allOutputs.length; + + expect(summaryResult.comment).toBe( + `Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues} AvgDiff: ${expectedAvgDiff}` + ); +}); From f21cffce2ba129027bda1586a2a0a4058d58fccf Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 11:03:13 -0800 Subject: [PATCH 07/13] fmt --- js/src/tests/evaluate.int.test.ts | 60 +++++++++++++++++++------------ 1 file changed, 37 insertions(+), 23 deletions(-) diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index 023de5706..09306bb7c 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -999,10 +999,12 @@ test("evaluate can evaluate with updated summary evaluators", async () => { }): Promise => { const runIds = runs.map(({ id }) => id).join(", "); const exampleIds = examples.map(({ id }) => id).join(", "); - const inputValues = inputs.map(input => input.input).join(", "); - const outputValues = outputs.map(output => output.foo).join(", "); - const referenceOutputValues = referenceOutputs.map(ref => ref.output).join(", "); - + const inputValues = inputs.map((input) => input.input).join(", "); + const outputValues = outputs.map((output) => output.foo).join(", "); + const referenceOutputValues = referenceOutputs + .map((ref) => ref.output) + .join(", "); + return Promise.resolve({ key: "UpdatedSummaryEvaluator", score: 1, @@ -1024,13 +1026,17 @@ test("evaluate can evaluate with updated summary evaluators", async () => { const allExamples = evalRes.results.map(({ example }) => example); const allInputs = evalRes.results.map(({ example }) => example.inputs); const allOutputs = evalRes.results.map(({ run }) => run.outputs); - const allReferenceOutputs = evalRes.results.map(({ example }) => example.outputs); + const allReferenceOutputs = evalRes.results.map( + ({ example }) => example.outputs + ); const runIds = allRuns.map(({ id }) => id).join(", "); const exampleIds = allExamples.map(({ id }) => id).join(", "); - const inputValues = allInputs.map(input => input.input).join(", "); - const outputValues = allOutputs.map(output => output.foo).join(", "); - const referenceOutputValues = allReferenceOutputs.map(ref => ref.output).join(", "); + const inputValues = allInputs.map((input) => input.input).join(", "); + const outputValues = allOutputs.map((output) => output.foo).join(", "); + const referenceOutputValues = allReferenceOutputs + .map((ref) => ref.output) + .join(", "); expect(evalRes.summaryResults.results[0].comment).toBe( `Runs: ${runIds} Examples: ${exampleIds} Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues}` @@ -1054,15 +1060,18 @@ test("evaluate handles partial summary evaluator parameters correctly", async () outputs: Record[]; referenceOutputs: Record[]; }): Promise => { - const inputValues = inputs.map(input => input.input).join(", "); - const outputValues = outputs.map(output => output.foo).join(", "); - const referenceOutputValues = referenceOutputs.map(ref => ref.output).join(", "); - + const inputValues = inputs.map((input) => input.input).join(", "); + const outputValues = outputs.map((output) => output.foo).join(", "); + const referenceOutputValues = referenceOutputs + .map((ref) => ref.output) + .join(", "); + // Calculate average difference between outputs and reference outputs - const avgDiff = outputs.reduce((sum, output, i) => { - return sum + Math.abs(output.foo - referenceOutputs[i].output); - }, 0) / outputs.length; - + const avgDiff = + outputs.reduce((sum, output, i) => { + return sum + Math.abs(output.foo - referenceOutputs[i].output); + }, 0) / outputs.length; + return Promise.resolve({ key: "OutputOnlySummaryEvaluator", score: avgDiff === 0 ? 1 : 0, @@ -1084,16 +1093,21 @@ test("evaluate handles partial summary evaluator parameters correctly", async () // Verify the comment contains all the expected parts const allInputs = evalRes.results.map(({ example }) => example.inputs); const allOutputs = evalRes.results.map(({ run }) => run.outputs); - const allReferenceOutputs = evalRes.results.map(({ example }) => example.outputs); + const allReferenceOutputs = evalRes.results.map( + ({ example }) => example.outputs + ); - const inputValues = allInputs.map(input => input.input).join(", "); - const outputValues = allOutputs.map(output => output.foo).join(", "); - const referenceOutputValues = allReferenceOutputs.map(ref => ref.output).join(", "); + const inputValues = allInputs.map((input) => input.input).join(", "); + const outputValues = allOutputs.map((output) => output.foo).join(", "); + const referenceOutputValues = allReferenceOutputs + .map((ref) => ref.output) + .join(", "); // Calculate expected average difference - const expectedAvgDiff = allOutputs.reduce((sum, output, i) => { - return sum + Math.abs(output.foo - allReferenceOutputs[i].output); - }, 0) / allOutputs.length; + const expectedAvgDiff = + allOutputs.reduce((sum, output, i) => { + return sum + Math.abs(output.foo - allReferenceOutputs[i].output); + }, 0) / allOutputs.length; expect(summaryResult.comment).toBe( `Inputs: ${inputValues} Outputs: ${outputValues} ReferenceOutputs: ${referenceOutputValues} AvgDiff: ${expectedAvgDiff}` From 55a8aa65ba26494f92fa746c3e844420e3d15437 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 11:13:50 -0800 Subject: [PATCH 08/13] fmt --- js/src/evaluation/_runner.ts | 37 ++++++++++++------ js/src/tests/evaluate.int.test.ts | 62 ++++++++++++++++--------------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts index 4507a81dc..79617865b 100644 --- a/js/src/evaluation/_runner.ts +++ b/js/src/evaluation/_runner.ts @@ -685,17 +685,7 @@ export class _ExperimentManager { for (const evaluator of wrappedEvaluators) { try { - const inputs = examples.map((example) => example.inputs); - const outputs = runsArray.map((run) => run.outputs); - const referenceOutputs = examples.map((example) => example.outputs); - - const summaryEvalResult = await evaluator({ - runs: runsArray, - examples, - inputs, - outputs, - referenceOutputs, - }); + const summaryEvalResult = await evaluator(runsArray, examples); const flattenedResults = this.client._selectEvalResults(summaryEvalResult); @@ -1004,6 +994,31 @@ async function wrapSummaryEvaluators( _runs_: string, _examples_: string ): Promise => { + // Check if the evaluator expects an object parameter + if (evaluator.length === 1) { + const inputs = examples.map((ex) => ex.inputs); + const outputs = runs.map((run) => run.outputs || {}); + const referenceOutputs = examples.map((ex) => ex.outputs || {}); + + return Promise.resolve( + ( + evaluator as (args: { + runs?: Run[]; + examples?: Example[]; + inputs?: Record[]; + outputs?: Record[]; + referenceOutputs?: Record[]; + }) => EvaluationResult | EvaluationResults + )({ + runs, + examples, + inputs, + outputs, + referenceOutputs, + }) + ); + } + // Otherwise use the traditional (runs, examples) signature return Promise.resolve(evaluator(runs, examples)); }, { ...optionsArray, name: evalName } diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index 09306bb7c..08c0545fc 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -984,25 +984,22 @@ test("evaluate can evaluate with updated summary evaluators", async () => { }; }; - const customSummaryEvaluator = ({ - runs, - examples, - inputs, - outputs, - referenceOutputs, - }: { - runs: Run[]; - examples: Example[]; - inputs: Record[]; - outputs: Record[]; - referenceOutputs: Record[]; - }): Promise => { + // Update the type to match SummaryEvaluatorT + const customSummaryEvaluator = ( + runs: Run[], + examples: Example[] + ): Promise => { const runIds = runs.map(({ id }) => id).join(", "); const exampleIds = examples.map(({ id }) => id).join(", "); + const inputs = examples.map((ex) => ex.inputs); + const outputs = runs.map((run) => run.outputs || {}); + const referenceOutputs = examples.map((ex) => ex.outputs || {}); + const inputValues = inputs.map((input) => input.input).join(", "); const outputValues = outputs.map((output) => output.foo).join(", "); const referenceOutputValues = referenceOutputs .map((ref) => ref.output) + .filter((output): output is number => output !== undefined) .join(", "); return Promise.resolve({ @@ -1025,9 +1022,9 @@ test("evaluate can evaluate with updated summary evaluators", async () => { const allRuns = evalRes.results.map(({ run }) => run); const allExamples = evalRes.results.map(({ example }) => example); const allInputs = evalRes.results.map(({ example }) => example.inputs); - const allOutputs = evalRes.results.map(({ run }) => run.outputs); + const allOutputs = evalRes.results.map(({ run }) => run.outputs || {}); const allReferenceOutputs = evalRes.results.map( - ({ example }) => example.outputs + ({ example }) => example.outputs || {} ); const runIds = allRuns.map(({ id }) => id).join(", "); @@ -1036,6 +1033,7 @@ test("evaluate can evaluate with updated summary evaluators", async () => { const outputValues = allOutputs.map((output) => output.foo).join(", "); const referenceOutputValues = allReferenceOutputs .map((ref) => ref.output) + .filter((output): output is number => output !== undefined) .join(", "); expect(evalRes.summaryResults.results[0].comment).toBe( @@ -1050,26 +1048,28 @@ test("evaluate handles partial summary evaluator parameters correctly", async () }; }; - // Summary evaluator that only uses inputs, outputs, and referenceOutputs - const outputOnlySummaryEvaluator = ({ - inputs, - outputs, - referenceOutputs, - }: { - inputs: Record[]; - outputs: Record[]; - referenceOutputs: Record[]; - }): Promise => { + // Update to match SummaryEvaluatorT type + const outputOnlySummaryEvaluator = ( + runs: Run[], + examples: Example[] + ): Promise => { + const inputs = examples.map((ex) => ex.inputs); + const outputs = runs.map((run) => run.outputs || {}); + const referenceOutputs = examples.map((ex) => ex.outputs || {}); + const inputValues = inputs.map((input) => input.input).join(", "); const outputValues = outputs.map((output) => output.foo).join(", "); const referenceOutputValues = referenceOutputs .map((ref) => ref.output) + .filter((output): output is number => output !== undefined) .join(", "); // Calculate average difference between outputs and reference outputs const avgDiff = outputs.reduce((sum, output, i) => { - return sum + Math.abs(output.foo - referenceOutputs[i].output); + const refOutput = referenceOutputs[i]?.output; + if (output.foo === undefined || refOutput === undefined) return sum; + return sum + Math.abs(output.foo - refOutput); }, 0) / outputs.length; return Promise.resolve({ @@ -1090,23 +1090,25 @@ test("evaluate handles partial summary evaluator parameters correctly", async () expect(summaryResult.key).toBe("OutputOnlySummaryEvaluator"); expect(typeof summaryResult.score).toBe("number"); - // Verify the comment contains all the expected parts const allInputs = evalRes.results.map(({ example }) => example.inputs); - const allOutputs = evalRes.results.map(({ run }) => run.outputs); + const allOutputs = evalRes.results.map(({ run }) => run.outputs || {}); const allReferenceOutputs = evalRes.results.map( - ({ example }) => example.outputs + ({ example }) => example.outputs || {} ); const inputValues = allInputs.map((input) => input.input).join(", "); const outputValues = allOutputs.map((output) => output.foo).join(", "); const referenceOutputValues = allReferenceOutputs .map((ref) => ref.output) + .filter((output): output is number => output !== undefined) .join(", "); // Calculate expected average difference const expectedAvgDiff = allOutputs.reduce((sum, output, i) => { - return sum + Math.abs(output.foo - allReferenceOutputs[i].output); + const refOutput = allReferenceOutputs[i]?.output; + if (output.foo === undefined || refOutput === undefined) return sum; + return sum + Math.abs(output.foo - refOutput); }, 0) / allOutputs.length; expect(summaryResult.comment).toBe( From fade984da0de5d9f8f92edeb221da04db3a8f62f Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 11:23:35 -0800 Subject: [PATCH 09/13] fix ci --- .github/workflows/integration_tests.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 43bd5ba95..be321d14d 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -4,16 +4,10 @@ on: push: branches: - main - paths: - - 'python/**' - - 'js/**' pull_request: branches: - main types: [opened, synchronize, reopened, labeled, unlabeled] - paths: - - 'python/**' - - 'js/**' workflow_dispatch: inputs: run-python-tests: From 8d3bbdcd3aabf940ca5e0aef1d7bc9b56f19cbd8 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 11:58:51 -0800 Subject: [PATCH 10/13] undo --- js/src/tests/evaluate.int.test.ts | 62 +++++++++++++++---------------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index 08c0545fc..09306bb7c 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -984,22 +984,25 @@ test("evaluate can evaluate with updated summary evaluators", async () => { }; }; - // Update the type to match SummaryEvaluatorT - const customSummaryEvaluator = ( - runs: Run[], - examples: Example[] - ): Promise => { + const customSummaryEvaluator = ({ + runs, + examples, + inputs, + outputs, + referenceOutputs, + }: { + runs: Run[]; + examples: Example[]; + inputs: Record[]; + outputs: Record[]; + referenceOutputs: Record[]; + }): Promise => { const runIds = runs.map(({ id }) => id).join(", "); const exampleIds = examples.map(({ id }) => id).join(", "); - const inputs = examples.map((ex) => ex.inputs); - const outputs = runs.map((run) => run.outputs || {}); - const referenceOutputs = examples.map((ex) => ex.outputs || {}); - const inputValues = inputs.map((input) => input.input).join(", "); const outputValues = outputs.map((output) => output.foo).join(", "); const referenceOutputValues = referenceOutputs .map((ref) => ref.output) - .filter((output): output is number => output !== undefined) .join(", "); return Promise.resolve({ @@ -1022,9 +1025,9 @@ test("evaluate can evaluate with updated summary evaluators", async () => { const allRuns = evalRes.results.map(({ run }) => run); const allExamples = evalRes.results.map(({ example }) => example); const allInputs = evalRes.results.map(({ example }) => example.inputs); - const allOutputs = evalRes.results.map(({ run }) => run.outputs || {}); + const allOutputs = evalRes.results.map(({ run }) => run.outputs); const allReferenceOutputs = evalRes.results.map( - ({ example }) => example.outputs || {} + ({ example }) => example.outputs ); const runIds = allRuns.map(({ id }) => id).join(", "); @@ -1033,7 +1036,6 @@ test("evaluate can evaluate with updated summary evaluators", async () => { const outputValues = allOutputs.map((output) => output.foo).join(", "); const referenceOutputValues = allReferenceOutputs .map((ref) => ref.output) - .filter((output): output is number => output !== undefined) .join(", "); expect(evalRes.summaryResults.results[0].comment).toBe( @@ -1048,28 +1050,26 @@ test("evaluate handles partial summary evaluator parameters correctly", async () }; }; - // Update to match SummaryEvaluatorT type - const outputOnlySummaryEvaluator = ( - runs: Run[], - examples: Example[] - ): Promise => { - const inputs = examples.map((ex) => ex.inputs); - const outputs = runs.map((run) => run.outputs || {}); - const referenceOutputs = examples.map((ex) => ex.outputs || {}); - + // Summary evaluator that only uses inputs, outputs, and referenceOutputs + const outputOnlySummaryEvaluator = ({ + inputs, + outputs, + referenceOutputs, + }: { + inputs: Record[]; + outputs: Record[]; + referenceOutputs: Record[]; + }): Promise => { const inputValues = inputs.map((input) => input.input).join(", "); const outputValues = outputs.map((output) => output.foo).join(", "); const referenceOutputValues = referenceOutputs .map((ref) => ref.output) - .filter((output): output is number => output !== undefined) .join(", "); // Calculate average difference between outputs and reference outputs const avgDiff = outputs.reduce((sum, output, i) => { - const refOutput = referenceOutputs[i]?.output; - if (output.foo === undefined || refOutput === undefined) return sum; - return sum + Math.abs(output.foo - refOutput); + return sum + Math.abs(output.foo - referenceOutputs[i].output); }, 0) / outputs.length; return Promise.resolve({ @@ -1090,25 +1090,23 @@ test("evaluate handles partial summary evaluator parameters correctly", async () expect(summaryResult.key).toBe("OutputOnlySummaryEvaluator"); expect(typeof summaryResult.score).toBe("number"); + // Verify the comment contains all the expected parts const allInputs = evalRes.results.map(({ example }) => example.inputs); - const allOutputs = evalRes.results.map(({ run }) => run.outputs || {}); + const allOutputs = evalRes.results.map(({ run }) => run.outputs); const allReferenceOutputs = evalRes.results.map( - ({ example }) => example.outputs || {} + ({ example }) => example.outputs ); const inputValues = allInputs.map((input) => input.input).join(", "); const outputValues = allOutputs.map((output) => output.foo).join(", "); const referenceOutputValues = allReferenceOutputs .map((ref) => ref.output) - .filter((output): output is number => output !== undefined) .join(", "); // Calculate expected average difference const expectedAvgDiff = allOutputs.reduce((sum, output, i) => { - const refOutput = allReferenceOutputs[i]?.output; - if (output.foo === undefined || refOutput === undefined) return sum; - return sum + Math.abs(output.foo - refOutput); + return sum + Math.abs(output.foo - allReferenceOutputs[i].output); }, 0) / allOutputs.length; expect(summaryResult.comment).toBe( From 1b62e2f72a762cc47a4b58bb044f2b7bda21c16e Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 12:11:17 -0800 Subject: [PATCH 11/13] fix --- js/src/evaluation/_runner.ts | 8 +++--- js/src/tests/evaluate.int.test.ts | 48 +++++++++++++++---------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts index 79617865b..d6b97b484 100644 --- a/js/src/evaluation/_runner.ts +++ b/js/src/evaluation/_runner.ts @@ -47,10 +47,10 @@ export type SummaryEvaluatorT = }) => EvaluationResult | EvaluationResults) | ((args: { runs?: Array; - example?: Example; - inputs?: Record; - outputs?: Record; - referenceOutputs?: Record; + examples?: Array; + inputs?: Array>; + outputs?: Array>; + referenceOutputs?: Array>; }) => Promise); // Row-level evaluator diff --git a/js/src/tests/evaluate.int.test.ts b/js/src/tests/evaluate.int.test.ts index 09306bb7c..73de7246f 100644 --- a/js/src/tests/evaluate.int.test.ts +++ b/js/src/tests/evaluate.int.test.ts @@ -991,18 +991,18 @@ test("evaluate can evaluate with updated summary evaluators", async () => { outputs, referenceOutputs, }: { - runs: Run[]; - examples: Example[]; - inputs: Record[]; - outputs: Record[]; - referenceOutputs: Record[]; + runs?: Run[]; + examples?: Example[]; + inputs?: Record[]; + outputs?: Record[]; + referenceOutputs?: Record[]; }): Promise => { - const runIds = runs.map(({ id }) => id).join(", "); - const exampleIds = examples.map(({ id }) => id).join(", "); - const inputValues = inputs.map((input) => input.input).join(", "); - const outputValues = outputs.map((output) => output.foo).join(", "); + const runIds = runs?.map(({ id }) => id).join(", ") || ""; + const exampleIds = examples?.map(({ id }) => id).join(", "); + const inputValues = inputs?.map((input) => input.input).join(", "); + const outputValues = outputs?.map((output) => output.foo).join(", "); const referenceOutputValues = referenceOutputs - .map((ref) => ref.output) + ?.map((ref) => ref.output) .join(", "); return Promise.resolve({ @@ -1033,9 +1033,9 @@ test("evaluate can evaluate with updated summary evaluators", async () => { const runIds = allRuns.map(({ id }) => id).join(", "); const exampleIds = allExamples.map(({ id }) => id).join(", "); const inputValues = allInputs.map((input) => input.input).join(", "); - const outputValues = allOutputs.map((output) => output.foo).join(", "); + const outputValues = allOutputs.map((output) => output?.foo).join(", "); const referenceOutputValues = allReferenceOutputs - .map((ref) => ref.output) + .map((ref) => ref?.output) .join(", "); expect(evalRes.summaryResults.results[0].comment).toBe( @@ -1056,21 +1056,21 @@ test("evaluate handles partial summary evaluator parameters correctly", async () outputs, referenceOutputs, }: { - inputs: Record[]; - outputs: Record[]; - referenceOutputs: Record[]; + inputs?: Record[]; + outputs?: Record[]; + referenceOutputs?: Record[]; }): Promise => { - const inputValues = inputs.map((input) => input.input).join(", "); - const outputValues = outputs.map((output) => output.foo).join(", "); + const inputValues = inputs?.map((input) => input.input).join(", ") || ""; + const outputValues = outputs?.map((output) => output.foo).join(", ") || ""; const referenceOutputValues = referenceOutputs - .map((ref) => ref.output) + ?.map((ref) => ref?.output) .join(", "); // Calculate average difference between outputs and reference outputs const avgDiff = - outputs.reduce((sum, output, i) => { - return sum + Math.abs(output.foo - referenceOutputs[i].output); - }, 0) / outputs.length; + outputs?.reduce((sum, output, i) => { + return sum + Math.abs(output?.foo - referenceOutputs?.[i]?.output); + }, 0) || 0; return Promise.resolve({ key: "OutputOnlySummaryEvaluator", @@ -1098,15 +1098,15 @@ test("evaluate handles partial summary evaluator parameters correctly", async () ); const inputValues = allInputs.map((input) => input.input).join(", "); - const outputValues = allOutputs.map((output) => output.foo).join(", "); + const outputValues = allOutputs.map((output) => output?.foo).join(", "); const referenceOutputValues = allReferenceOutputs - .map((ref) => ref.output) + .map((ref) => ref?.output) .join(", "); // Calculate expected average difference const expectedAvgDiff = allOutputs.reduce((sum, output, i) => { - return sum + Math.abs(output.foo - allReferenceOutputs[i].output); + return sum + Math.abs(output?.foo - allReferenceOutputs[i]?.output); }, 0) / allOutputs.length; expect(summaryResult.comment).toBe( From cf813922d742325fddb5a312dbd8287ba4a72418 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Mon, 2 Dec 2024 13:25:45 -0800 Subject: [PATCH 12/13] deprecation --- js/src/evaluation/_runner.ts | 48 +++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/js/src/evaluation/_runner.ts b/js/src/evaluation/_runner.ts index d6b97b484..83414592a 100644 --- a/js/src/evaluation/_runner.ts +++ b/js/src/evaluation/_runner.ts @@ -27,17 +27,23 @@ export type TargetT = // Data format: dataset-name, dataset_id, or examples export type DataT = string | AsyncIterable | Example[]; -// Summary evaluator runs over the whole dataset // and reports aggregate metric(s) +/** @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ... */ +type DeprecatedSyncSummaryEvaluator = ( + runs: Array, + examples: Array +) => EvaluationResult | EvaluationResults; + +/** @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ... */ +type DeprecatedAsyncSummaryEvaluator = ( + runs: Array, + examples: Array +) => Promise; + +// Summary evaluator runs over the whole dataset export type SummaryEvaluatorT = - | (( - runs: Array, - examples: Array - ) => Promise) - | (( - runs: Array, - examples: Array - ) => EvaluationResult | EvaluationResults) + | DeprecatedSyncSummaryEvaluator + | DeprecatedAsyncSummaryEvaluator | ((args: { runs?: Array; examples?: Array; @@ -53,14 +59,26 @@ export type SummaryEvaluatorT = referenceOutputs?: Array>; }) => Promise); +/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */ +type DeprecatedRunEvaluator = RunEvaluator; + +/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */ +type DeprecatedFunctionEvaluator = ( + run: Run, + example?: Example +) => EvaluationResult | EvaluationResults; + +/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */ +type DeprecatedAsyncFunctionEvaluator = ( + run: Run, + example?: Example +) => Promise; + // Row-level evaluator export type EvaluatorT = - | RunEvaluator - | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults) - | (( - run: Run, - example?: Example - ) => Promise) + | DeprecatedRunEvaluator + | DeprecatedFunctionEvaluator + | DeprecatedAsyncFunctionEvaluator | ((args: { run?: Run; example?: Example; From b0bde016adc25314afbca1abdebbdfe2e1e2fc3e Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Mon, 2 Dec 2024 16:43:13 -0800 Subject: [PATCH 13/13] js[patch]: simple evaluateComparative evaluator args (#1277) --- js/src/evaluation/evaluate_comparative.ts | 45 ++++-- js/src/tests/evaluate_comparative.int.test.ts | 146 +++++++++++++++++- 2 files changed, 172 insertions(+), 19 deletions(-) diff --git a/js/src/evaluation/evaluate_comparative.ts b/js/src/evaluation/evaluate_comparative.ts index 5a67ee9f5..5b66e1530 100644 --- a/js/src/evaluation/evaluate_comparative.ts +++ b/js/src/evaluation/evaluate_comparative.ts @@ -69,16 +69,29 @@ async function loadTraces( return results; } +/** @deprecated Use ComparativeEvaluatorNew instead: (args: { runs, example, inputs, outputs, referenceOutputs }) => ... */ +export type _ComparativeEvaluatorLegacy = ( + runs: Run[], + example: Example +) => ComparisonEvaluationResultRow | Promise; + +export type _ComparativeEvaluator = (args: { + runs?: Run[]; + example?: Example; + inputs?: Record; + outputs?: Record[]; + referenceOutputs?: Record; +}) => ComparisonEvaluationResultRow | Promise; + +export type ComparativeEvaluator = + | _ComparativeEvaluatorLegacy + | _ComparativeEvaluator; + export interface EvaluateComparativeOptions { /** * A list of evaluators to use for comparative evaluation. */ - evaluators: Array< - ( - runs: Run[], - example: Example - ) => ComparisonEvaluationResultRow | Promise - >; + evaluators: Array; /** * Randomize the order of outputs for each evaluation * @default false @@ -306,16 +319,20 @@ export async function evaluateComparative( async function evaluateAndSubmitFeedback( runs: Run[], example: Example, - evaluator: ( - runs: Run[], - example: Example - ) => ComparisonEvaluationResultRow | Promise + evaluator: ComparativeEvaluator ) { const expectedRunIds = new Set(runs.map((r) => r.id)); - const result = await evaluator( - options.randomizeOrder ? shuffle(runs) : runs, - example - ); + // Check if evaluator expects an object parameter + const result = + evaluator.length === 1 + ? await (evaluator as _ComparativeEvaluator)({ + runs: options.randomizeOrder ? shuffle(runs) : runs, + example, + inputs: example.inputs, + outputs: runs.map((run) => run.outputs || {}), + referenceOutputs: example.outputs || {}, + }) + : await (evaluator as _ComparativeEvaluatorLegacy)(runs, example); for (const [runId, score] of Object.entries(result.scores)) { // validate if the run id diff --git a/js/src/tests/evaluate_comparative.int.test.ts b/js/src/tests/evaluate_comparative.int.test.ts index 81c14d653..3f8520b84 100644 --- a/js/src/tests/evaluate_comparative.int.test.ts +++ b/js/src/tests/evaluate_comparative.int.test.ts @@ -1,6 +1,10 @@ import { evaluate } from "../evaluation/_runner.js"; -import { evaluateComparative } from "../evaluation/evaluate_comparative.js"; +import { + evaluateComparative, + _ComparativeEvaluator, +} from "../evaluation/evaluate_comparative.js"; import { Client } from "../index.js"; +import { Run } from "../schemas.js"; import { waitUntilRunFound } from "./utils.js"; import { v4 as uuidv4 } from "uuid"; @@ -51,9 +55,11 @@ describe("evaluate comparative", () => { [firstEval.experimentName, secondEval.experimentName], { evaluators: [ - (runs) => ({ + ({ runs }: { runs?: Run[] }) => ({ key: "latter_precedence", - scores: Object.fromEntries(runs.map((run, i) => [run.id, i % 2])), + scores: Object.fromEntries( + runs?.map((run, i) => [run.id, i % 2]) ?? [] + ), }), ], } @@ -74,9 +80,11 @@ describe("evaluate comparative", () => { ], { evaluators: [ - (runs) => ({ + ({ runs }: { runs?: Run[] }) => ({ key: "latter_precedence", - scores: Object.fromEntries(runs.map((run, i) => [run.id, i % 2])), + scores: Object.fromEntries( + runs?.map((run, i) => [run.id, i % 2]) ?? [] + ), }), ], } @@ -84,4 +92,132 @@ describe("evaluate comparative", () => { expect(pairwise.results.length).toEqual(2); }); + + describe("evaluator formats", () => { + test("old format evaluator", async () => { + const pairwise = await evaluateComparative( + [ + evaluate((input) => ({ foo: `first:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + evaluate((input) => ({ foo: `second:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + ], + { + evaluators: [ + // Old format evaluator + (runs, example) => ({ + key: "old_format", + scores: Object.fromEntries( + runs.map((run) => [ + run.id, + run.outputs?.foo === `second:${example.inputs.input}` ? 1 : 0, + ]) + ), + }), + ], + } + ); + + expect(pairwise.results.length).toEqual(2); + expect(pairwise.results[0].key).toBe("old_format"); + // Second run in each pair should have score of 1 + expect(Object.values(pairwise.results[0].scores)).toEqual([0, 1]); + }); + + test("new format evaluator", async () => { + const matchesSecondEvaluator: _ComparativeEvaluator = ({ + runs, + inputs, + outputs, + }: { + runs?: Run[]; + inputs?: Record; + outputs?: Record[]; + }) => ({ + key: "new_format", + scores: Object.fromEntries( + // Add null checks for the optional parameters + runs?.map((run, i) => [ + run.id, + outputs?.[i]?.foo === `second:${inputs?.input}` ? 1 : 0, + ]) ?? [] + ), + }); + + const pairwise = await evaluateComparative( + [ + evaluate((input) => ({ foo: `first:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + evaluate((input) => ({ foo: `second:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + ], + { + evaluators: [matchesSecondEvaluator], + } + ); + + expect(pairwise.results.length).toEqual(2); + expect(pairwise.results[0].key).toBe("new_format"); + // Second run in each pair should have score of 1 + expect(Object.values(pairwise.results[0].scores)).toEqual([0, 1]); + }); + + test("mixed old and new format evaluators", async () => { + const matchesSecondEvaluator: _ComparativeEvaluator = ({ + runs, + inputs, + outputs, + }: { + runs?: Run[]; + inputs?: Record; + outputs?: Record[]; + }) => ({ + key: "new_format", + scores: Object.fromEntries( + runs?.map((run, i) => [ + run.id, + outputs?.[i]?.foo === `second:${inputs?.input}` ? 1 : 0, + ]) ?? [] + ), + }); + const pairwise = await evaluateComparative( + [ + evaluate((input) => ({ foo: `first:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + evaluate((input) => ({ foo: `second:${input.input}` }), { + data: TESTING_DATASET_NAME, + }), + ], + { + evaluators: [ + // Old format + (runs, example) => ({ + key: "old_format", + scores: Object.fromEntries( + runs.map((run) => [ + run.id, + run.outputs?.foo === `second:${example.inputs.input}` ? 1 : 0, + ]) + ), + }), + // New format + matchesSecondEvaluator, + ], + } + ); + + expect(pairwise.results.length).toEqual(4); // 2 examples × 2 evaluators + expect(pairwise.results.map((r) => r.key)).toContain("old_format"); + expect(pairwise.results.map((r) => r.key)).toContain("new_format"); + // Each evaluator should score the second run as 1 + pairwise.results.forEach((result) => { + expect(Object.values(result.scores)).toEqual([0, 1]); + }); + }); + }); });