Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

js[patch]: simple evaluator args #1264

Merged
merged 17 commits into from
Dec 3, 2024
102 changes: 87 additions & 15 deletions js/src/evaluation/_runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,26 +27,72 @@ export type TargetT<TInput = any, TOutput = KVMap> =
// Source of evaluation data: a dataset name or dataset id (both strings),
// or the examples themselves as an array or an async iterable.
export type DataT = string | Array<Example> | AsyncIterable<Example>;

// Summary evaluator runs over the whole dataset
// and reports aggregate metric(s)
/**
 * Synchronous summary evaluator using the positional `(runs, examples)`
 * signature.
 *
 * @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ...
 */
type DeprecatedSyncSummaryEvaluator = (
  runs: Run[],
  examples: Example[]
) => EvaluationResult | EvaluationResults;

/**
 * Asynchronous summary evaluator using the positional `(runs, examples)`
 * signature; resolves to the evaluation result(s).
 *
 * @deprecated Use object parameter version instead: (args: { runs, examples, inputs, outputs, referenceOutputs }) => ...
 */
type DeprecatedAsyncSummaryEvaluator = (
  runs: Run[],
  examples: Example[]
) => Promise<EvaluationResult | EvaluationResults>;

/**
 * Summary evaluator: runs over the whole dataset and reports aggregate
 * metric(s).
 *
 * Four call shapes are accepted:
 * - the deprecated positional `(runs, examples)` form, sync or async;
 * - a single object argument carrying `runs`, `examples`, `inputs`,
 *   `outputs` and `referenceOutputs` (all optional), sync or async.
 */
export type SummaryEvaluatorT =
  | DeprecatedSyncSummaryEvaluator
  | DeprecatedAsyncSummaryEvaluator
  | ((args: {
      runs?: Array<Run>;
      examples?: Array<Example>;
      inputs?: Array<Record<string, any>>;
      outputs?: Array<Record<string, any>>;
      referenceOutputs?: Array<Record<string, any>>;
    }) => EvaluationResult | EvaluationResults)
  | ((args: {
      runs?: Array<Run>;
      examples?: Array<Example>;
      inputs?: Array<Record<string, any>>;
      outputs?: Array<Record<string, any>>;
      referenceOutputs?: Array<Record<string, any>>;
    }) => Promise<EvaluationResult | EvaluationResults>);

/**
 * Alias of `RunEvaluator`, named separately so the deprecation notice can be
 * attached to it.
 *
 * @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ...
 */
type DeprecatedRunEvaluator = RunEvaluator;

/**
 * Synchronous row-level evaluator using the positional `(run, example?)`
 * signature.
 *
 * @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ...
 */
type DeprecatedFunctionEvaluator = (
  run: Run,
  example?: Example
) => EvaluationResult | EvaluationResults;

/**
 * Asynchronous row-level evaluator using the positional `(run, example?)`
 * signature; resolves to the evaluation result(s).
 *
 * @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ...
 */
type DeprecatedAsyncFunctionEvaluator = (
  run: Run,
  example?: Example
) => Promise<EvaluationResult | EvaluationResults>;

/**
 * Row-level evaluator.
 *
 * Accepted shapes:
 * - the deprecated `RunEvaluator` alias;
 * - the deprecated positional `(run, example?)` function, sync or async;
 * - a function taking a single object argument carrying `run`, `example`,
 *   `inputs`, `outputs` and `referenceOutputs` (all optional), sync or async.
 */
export type EvaluatorT =
  | DeprecatedRunEvaluator
  | DeprecatedFunctionEvaluator
  | DeprecatedAsyncFunctionEvaluator
  | ((args: {
      run?: Run;
      example?: Example;
      inputs?: Record<string, any>;
      outputs?: Record<string, any>;
      referenceOutputs?: Record<string, any>;
    }) => EvaluationResult | EvaluationResults)
  | ((args: {
      run?: Run;
      example?: Example;
      inputs?: Record<string, any>;
      outputs?: Record<string, any>;
      referenceOutputs?: Record<string, any>;
    }) => Promise<EvaluationResult | EvaluationResults>);

interface _ForwardResults {
run: Run;
Expand Down Expand Up @@ -658,6 +704,7 @@ export class _ExperimentManager {
for (const evaluator of wrappedEvaluators) {
try {
const summaryEvalResult = await evaluator(runsArray, examples);

const flattenedResults =
this.client._selectEvalResults(summaryEvalResult);
aggregateFeedback.push(...flattenedResults);
Expand Down Expand Up @@ -965,6 +1012,31 @@ async function wrapSummaryEvaluators(
_runs_: string,
_examples_: string
): Promise<EvaluationResult | EvaluationResults> => {
// Check if the evaluator expects an object parameter
if (evaluator.length === 1) {
const inputs = examples.map((ex) => ex.inputs);
const outputs = runs.map((run) => run.outputs || {});
const referenceOutputs = examples.map((ex) => ex.outputs || {});

return Promise.resolve(
Copy link
Collaborator

@jacoblee93 jacoblee93 Dec 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You don't need the `Promise.resolve` here; it's more standard to just make the outer function `async`:

      const wrapperSuperInner = traceable(
        async (
          _runs_: string,
          _examples_: string
        ): Promise<EvaluationResult | EvaluationResults> => {

(
evaluator as (args: {
runs?: Run[];
dqbd marked this conversation as resolved.
Show resolved Hide resolved
examples?: Example[];
inputs?: Record<string, any>[];
outputs?: Record<string, any>[];
referenceOutputs?: Record<string, any>[];
}) => EvaluationResult | EvaluationResults
)({
runs,
examples,
inputs,
outputs,
referenceOutputs,
})
);
}
// Otherwise use the traditional (runs, examples) signature
return Promise.resolve(evaluator(runs, examples));
},
{ ...optionsArray, name: evalName }
Expand Down
45 changes: 31 additions & 14 deletions js/src/evaluation/evaluate_comparative.ts
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,29 @@ async function loadTraces(
return results;
}

/**
 * Comparative evaluator using the positional `(runs, example)` signature.
 * May return its result row directly or as a promise.
 *
 * @deprecated Use `_ComparativeEvaluator` instead: (args: { runs, example, inputs, outputs, referenceOutputs }) => ...
 */
export type _ComparativeEvaluatorLegacy = (
runs: Run[],
example: Example
) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;

/**
 * Comparative evaluator that receives a single object argument.
 *
 * Every field is optional. `runs` and `outputs` are arrays, while `example`,
 * `inputs` and `referenceOutputs` are single values. The result row may be
 * returned directly or as a promise.
 */
export type _ComparativeEvaluator = (args: {
  runs?: Array<Run>;
  example?: Example;
  inputs?: Record<string, any>;
  outputs?: Array<Record<string, any>>;
  referenceOutputs?: Record<string, any>;
}) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>;

/**
 * Any supported comparative evaluator: the legacy positional form or the
 * single-object-argument form.
 */
export type ComparativeEvaluator = _ComparativeEvaluatorLegacy | _ComparativeEvaluator;

export interface EvaluateComparativeOptions {
/**
* A list of evaluators to use for comparative evaluation.
*/
evaluators: Array<
(
runs: Run[],
example: Example
) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>
>;
evaluators: Array<ComparativeEvaluator>;
/**
* Randomize the order of outputs for each evaluation
* @default false
Expand Down Expand Up @@ -306,16 +319,20 @@ export async function evaluateComparative(
async function evaluateAndSubmitFeedback(
runs: Run[],
example: Example,
evaluator: (
runs: Run[],
example: Example
) => ComparisonEvaluationResultRow | Promise<ComparisonEvaluationResultRow>
evaluator: ComparativeEvaluator
) {
const expectedRunIds = new Set(runs.map((r) => r.id));
const result = await evaluator(
options.randomizeOrder ? shuffle(runs) : runs,
example
);
// Check if evaluator expects an object parameter
const result =
evaluator.length === 1
? await (evaluator as _ComparativeEvaluator)({
runs: options.randomizeOrder ? shuffle(runs) : runs,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth supporting evaluator(runs)?

example,
inputs: example.inputs,
outputs: runs.map((run) => run.outputs || {}),
referenceOutputs: example.outputs || {},
})
: await (evaluator as _ComparativeEvaluatorLegacy)(runs, example);

for (const [runId, score] of Object.entries(result.scores)) {
// validate if the run id
Expand Down
29 changes: 27 additions & 2 deletions js/src/evaluation/evaluator.ts
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably also update the types for RunEvaluatorLike above right?

Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,21 @@ export type RunEvaluatorLike =
run: Run,
example?: Example
) => Promise<EvaluationResult | EvaluationResults>)
| ((run: Run, example?: Example) => EvaluationResult | EvaluationResults);
| ((run: Run, example?: Example) => EvaluationResult | EvaluationResults)
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
run?: Run;
example?: Example;
inputs?: Record<string, any>;
outputs?: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => Promise<EvaluationResult | EvaluationResults>);

/**
* Wraps an evaluator function + implements the RunEvaluator interface.
Expand All @@ -110,7 +124,18 @@ export class DynamicRunEvaluator<Func extends (...args: any[]) => any>
langSmithRunAndExample: { run: Run; example: Example };
}) => {
const { run, example } = input.langSmithRunAndExample;
return evaluator(run, example);

return evaluator(
{
...run,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we just do the arity check for both of these?

Is there a world where there are some crazies out there declaring evaluators like this:

async (run) => {
}

I would prefer the shim in both places unless we want to drop support for the two arg syntax in the long run, in which case we should deprecate the other signature:

export type RunEvaluatorLike =
 /** @deprecated NOTE */
  | ((
      run: Run,
      example?: Example
    ) => Promise<EvaluationResult | EvaluationResults>)
 /** @deprecated NOTE */
  | ((run: Run, example?: Example) => EvaluationResult | EvaluationResults)
  | ((args: {
      run?: Run;
      example?: Example;
      inputs?: Record<string, any>;
      outputs?: Record<string, any>;
      referenceOutputs?: Record<string, any>;
    }) => EvaluationResult | EvaluationResults)
  | ((args: {
      run?: Run;
      example?: Example;
      inputs?: Record<string, any>;
      outputs?: Record<string, any>;
      referenceOutputs?: Record<string, any>;
    }) => Promise<EvaluationResult | EvaluationResults>);

run,
example,
inputs: example?.inputs,
outputs: run?.outputs,
referenceOutputs: example?.outputs,
},
example
);
}) as Func;
}

Expand Down
Loading
Loading