From 08267af836c55a945abf93677a478b0c0dd25c52 Mon Sep 17 00:00:00 2001
From: Samuel Bushi
Date: Tue, 4 Feb 2025 02:20:48 +0000
Subject: [PATCH 1/3] fix: eval plugin docs

---
 docs/plugin-authoring-evaluator.md | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/docs/plugin-authoring-evaluator.md b/docs/plugin-authoring-evaluator.md
index 6b8c58172..9ee8d218b 100644
--- a/docs/plugin-authoring-evaluator.md
+++ b/docs/plugin-authoring-evaluator.md
@@ -61,23 +61,22 @@ function getDeliciousnessPrompt(ai: Genkit) {
     output: {
       schema: DeliciousnessDetectionResponseSchema,
-    }
-  },
-  `You are a food critic. Assess whether the provided output sounds delicious, giving only "yes" (delicious), "no" (not delicious), or "maybe" (undecided) as the verdict.
+    },
+    prompt: `You are a food critic. Assess whether the provided output sounds delicious, giving only "yes" (delicious), "no" (not delicious), or "maybe" (undecided) as the verdict.

-  Examples:
-  Output: Chicken parm sandwich
-  Response: { "reason": "A classic and beloved dish.", "verdict": "yes" }
+    Examples:
+    Output: Chicken parm sandwich
+    Response: { "reason": "A classic and beloved dish.", "verdict": "yes" }

-  Output: Boston Logan Airport tarmac
-  Response: { "reason": "Not edible.", "verdict": "no" }
+    Output: Boston Logan Airport tarmac
+    Response: { "reason": "Not edible.", "verdict": "no" }

-  Output: A juicy piece of gossip
-  Response: { "reason": "Metaphorically 'tasty' but not food.", "verdict": "maybe" }
+    Output: A juicy piece of gossip
+    Response: { "reason": "Metaphorically 'tasty' but not food.", "verdict": "maybe" }

-  New Output: {% verbatim %}{{ responseToTest }} {% endverbatim %}
-  Response:
-  `
-  );
+    New Output: {% verbatim %}{{ responseToTest }} {% endverbatim %}
+    Response:
+    `
+  });
 }
 ```
@@ -91,7 +90,7 @@
 responsibility of the evaluator to validate that all fields required for
 evaluation are present.

 ```ts
-import { ModelArgument, z } from 'genkit';
+import { Genkit, ModelArgument, z } from 'genkit';
 import { BaseEvalDataPoint, Score } from 'genkit/evaluator';

 /**
@@ -100,6 +99,7 @@ import { BaseEvalDataPoint, Score } from 'genkit/evaluator';
 export async function deliciousnessScore<
   CustomModelOptions extends z.ZodTypeAny,
 >(
+  ai: Genkit,
   judgeLlm: ModelArgument<CustomModelOptions>,
   dataPoint: BaseEvalDataPoint,
   judgeConfig?: CustomModelOptions
@@ -141,8 +141,7 @@
 The final step is to write a function that defines the `EvaluatorAction`.

 ```ts
-import { Genkit, z } from 'genkit';
-import { BaseEvalDataPoint, EvaluatorAction } from 'genkit/evaluator';
+import { EvaluatorAction } from 'genkit/evaluator';

 /**
  * Create the Deliciousness evaluator action.
  */
 export function createDeliciousnessEvaluator<
@@ -162,7 +161,7 @@ export function createDeliciousnessEvaluator<
     isBilled: true,
   },
   async (datapoint: BaseEvalDataPoint) => {
-    const score = await deliciousnessScore(judge, datapoint, judgeConfig);
+    const score = await deliciousnessScore(ai, judge, datapoint, judgeConfig);
     return {
       testCaseId: datapoint.testCaseId,
       evaluation: score,
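The hunks above complete the LLM-based evaluator. For orientation, here is a minimal sketch of how the patched `createDeliciousnessEvaluator` factory could be called; the app setup and the choice of `gemini15Flash` as the judge are illustrative assumptions, not part of this patch:

```ts
import { genkit } from 'genkit';
import { googleAI, gemini15Flash } from '@genkit-ai/googleai';

// Hypothetical wiring: instantiate Genkit, then create the evaluator action
// with Gemini 1.5 Flash standing in as the judge LLM.
const ai = genkit({ plugins: [googleAI()] });

const deliciousnessEvaluator = createDeliciousnessEvaluator(
  ai,
  gemini15Flash,
  {} // judge config; empty here, but model options could be passed through
);
```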
@@ -245,7 +244,6 @@
 As with the LLM-based evaluator, define the scoring function. In this case,
 the scoring function does not need a judge LLM.

 ```ts
-import { EvalResponses } from 'genkit';
 import { BaseEvalDataPoint, Score } from 'genkit/evaluator';

 const US_PHONE_REGEX =

From feda2a1c84d4b1090a81a07157dd077bde5432c3 Mon Sep 17 00:00:00 2001
From: ssbushi <66321939+ssbushi@users.noreply.github.com>
Date: Tue, 4 Feb 2025 19:25:29 +0000
Subject: [PATCH 2/3] fix: evaluation.md (#1817)

---
 docs/evaluation.md | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/docs/evaluation.md b/docs/evaluation.md
index 385a6da04..74a378178 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -42,8 +42,8 @@ This section explains how to perform inference-based evaluation using Genkit.

 ### Setup

-1. Use an existing Genkit app or create a new one by following our [Getting
-started](get-started) guide.
+1. Use an existing Genkit app or create a new one by following our [Get
+started](get-started.md) guide.
 2. Add the following code to define a simple RAG application to evaluate. For
 this guide, we use a dummy retriever that always returns the same documents.

 ```js
@@ -52,7 +52,6 @@ import { genkit, z, Document } from "genkit";
 import {
   googleAI,
   gemini15Flash,
-  gemini15Pro,
 } from "@genkit-ai/googleai";

 // Initialize Genkit
@@ -164,7 +163,7 @@ to open the Datasets page.
    c. Repeat steps (a) and (b) a couple more times to add more examples. This
 guide adds the following example inputs to the dataset:

-    ```
+    ```none {:.devsite-disable-click-to-copy}
     "Can I give milk to my cats?"
     "From which animals did dogs evolve?"
     ```
@@ -174,8 +173,8 @@

 ### Run evaluation and view results

-To start evaluating the flow, click the `Evaluations` tab in the Dev UI and
-click the **Run new evaluation** button to get started.
+To start evaluating the flow, click the **Run new evaluation** button on your
+dataset page. You can also start a new evaluation from the `Evaluations` tab.

 1. Select the `Flow` radio button to evaluate a flow.

@@ -234,7 +233,7 @@ and is only enforced if a schema is specified on the target flow.
 control for advanced use cases (e.g. providing model parameters, message
 history, tools, etc). You can find the full schema for `GenerateRequest` in
 our [API reference
-docs](https://js.api.genkit.dev/interfaces/genkit._.GenerateRequest.html).
+docs](https://js.api.genkit.dev/interfaces/genkit._.GenerateRequest.html){: .external}.

 Note: Schema validation is a helper tool for editing examples, but it is
 possible to save an example with invalid schema. These examples may fail when
@@ -245,7 +244,7 @@ running an evaluation.

 ### Genkit evaluators

 Genkit includes a small number of native evaluators, inspired by
-[RAGAS](https://docs.ragas.io/en/stable/), to help you get started:
+[RAGAS](https://docs.ragas.io/en/stable/){: .external}, to help you get started:

 * Faithfulness -- Measures the factual consistency of the generated answer
 against the given context
@@ -257,7 +256,7 @@ harm, or exploit

 ### Evaluator plugins

 Genkit supports additional evaluators through plugins, like the Vertex Rapid
-Evaluators, which you access via the [VertexAI
+Evaluators, which you can access via the [VertexAI
 Plugin](./plugins/vertex-ai#evaluators).

 ## Advanced use
@@ -309,7 +308,7 @@ field and an optional `reference` field, like below:

 If your flow requires auth, you may specify it using the `--auth` argument:

 ```posix-terminal
-genkit eval:flow qaFlow --input testInputs.json --auth "{\"email_verified\": true}"
+genkit eval:flow qaFlow --input testInputs.json --auth '{"auth":{"email_verified":true}}'
 ```

@@ -317,7 +316,7 @@ By default, the `eval:flow` and `eval:run` commands use all available metrics
 for evaluation. To run on a subset of the configured evaluators, use the
 `--evaluators` flag and provide a comma-separated list of evaluators by name:

 ```posix-terminal
-genkit eval:flow qaFlow --input testInputs.json --evaluators=genkit/faithfulness,genkit/answer_relevancy
+genkit eval:flow qaFlow --input testInputs.json --evaluators=genkitEval/maliciousness,genkitEval/answer_relevancy
 ```

 You can view the results of your evaluation run in the Dev UI at
 `localhost:4000/evaluate`.
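The `genkitEval/...` identifiers passed to `--evaluators` above are provided by Genkit's evaluator plugin, which must be configured in the app before these metrics are available. A minimal sketch, assuming the `@genkit-ai/evaluator` package; the judge and embedder choices here are illustrative, not part of this patch:

```ts
import { genkit } from 'genkit';
import { genkitEval, GenkitMetric } from '@genkit-ai/evaluator';
import { googleAI, gemini15Flash, textEmbedding004 } from '@genkit-ai/googleai';

const ai = genkit({
  plugins: [
    googleAI(),
    // Registers evaluators under names like `genkitEval/maliciousness` and
    // `genkitEval/answer_relevancy`, matching the --evaluators flag above.
    genkitEval({
      judge: gemini15Flash, // judge LLM used by the LLM-scored metrics
      metrics: [GenkitMetric.MALICIOUSNESS, GenkitMetric.ANSWER_RELEVANCY],
      embedder: textEmbedding004, // answer relevancy also needs an embedder
    }),
  ],
});
```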
@@ -385,6 +384,8 @@
 First, as a preparatory step, introduce an auxiliary step in our `qaFlow`
 example:

 ```js
+import { run } from '@genkit-ai/core';
+
 export const qaFlow = ai.defineFlow({
     name: 'qaFlow',
     inputSchema: z.string(),
@@ -409,7 +410,7 @@ export const qaFlow = ai.defineFlow({
     const llmResponse = await ai.generate({
       model: gemini15Flash,
       prompt: `Answer this question with the given context ${query}`,
-      docs: factDocs,
+      docs: factDocsModified,
     });
     return llmResponse.text;
   }
@@ -483,7 +484,8 @@ Here is an example flow that uses a PDF file to generate potential user
 questions.

 ```ts
-import { genkit, run, z } from "genkit";
+import { genkit, z } from "genkit";
+import { run } from "@genkit-ai/core";
 import { googleAI, gemini15Flash } from "@genkit-ai/googleai";
 import { chunk } from "llm-chunk"; // npm i llm-chunk
 import path from "path";

From cc7e2dad7d98c65d34f4ab5af6aae6ec49ba4ae4 Mon Sep 17 00:00:00 2001
From: Samuel Bushi
Date: Tue, 4 Feb 2025 19:45:46 +0000
Subject: [PATCH 3/3] format

---
 docs/evaluation.md | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/docs/evaluation.md b/docs/evaluation.md
index 9a7ec9363..e16443553 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -383,8 +383,6 @@
 First, as a preparatory step, introduce an auxiliary step in our `qaFlow`
 example:

 ```js
-import { run } from '@genkit-ai/core';
-
 export const qaFlow = ai.defineFlow({
     name: 'qaFlow',
     inputSchema: z.string(),
@@ -394,9 +392,8 @@ export const qaFlow = ai.defineFlow({
     const factDocs = await ai.retrieve({
       retriever: dummyRetriever,
       query,
-      options: { k: 2 },
     });
-    const factDocsModified = await run('factModified', async () => {
+    const factDocsModified = await ai.run('factModified', async () => {
       // Let us use only facts that are considered silly. This is a
       // hypothetical step for demo purposes, you may perform any
       // arbitrary task inside a step and reference it in custom
@@ -484,7 +481,6 @@ questions.

 ```ts
 import { genkit, z } from "genkit";
-import { run } from "@genkit-ai/core";
 import { googleAI, gemini15Flash } from "@genkit-ai/googleai";
 import { chunk } from "llm-chunk"; // npm i llm-chunk
 import path from "path";
@@ -517,9 +513,9 @@ export const synthesizeQuestions = ai.defineFlow(
   async (filePath) => {
     filePath = path.resolve(filePath);
     // `extractText` loads the PDF and extracts its contents as text.
-    const pdfTxt = await run("extract-text", () => extractText(filePath));
+    const pdfTxt = await ai.run("extract-text", () => extractText(filePath));

-    const chunks = await run("chunk-it", async () =>
+    const chunks = await ai.run("chunk-it", async () =>
       chunk(pdfTxt, chunkingConfig)
     );
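Since `synthesizeQuestions` is meant to produce evaluation inputs, a natural companion is a small exporter that writes its output in the `eval:flow --input` format shown earlier. A sketch under two assumptions: the flow resolves to an array of question strings, and the input file takes an array of `{ "input": ... }` entries (the helper below is hypothetical):

```ts
import { writeFileSync } from 'fs';

// Hypothetical helper: call the flow above and save its questions as
// [{ "input": ... }] entries, the shape consumed by
// `genkit eval:flow qaFlow --input testInputs.json`.
async function exportTestInputs(pdfPath: string) {
  const questions = await synthesizeQuestions(pdfPath);
  const samples = questions.map((input: string) => ({ input }));
  writeFileSync('testInputs.json', JSON.stringify(samples, null, 2));
}
```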