From 60c1c44d3e86aaffee8ce7df5b7f9eb160fceae2 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 8 Nov 2024 15:27:09 +0100 Subject: [PATCH] lint --- evalforge/__init__.py | 5 +- evalforge/alignment.py | 20 ++-- evalforge/cli.py | 72 +++++--------- evalforge/code_evaluator.py | 32 +++--- evalforge/code_formatter.py | 15 +-- evalforge/combined_scorer.py | 49 +++++----- evalforge/criterion_assertion_map.py | 12 +-- evalforge/data_utils.py | 140 +++++++++++++++++---------- evalforge/forge.py | 103 ++++++++++++-------- evalforge/instructor_models.py | 32 ++++-- evalforge/llm.py | 6 +- evalforge/llm_evaluator.py | 8 +- evalforge/prompts.py | 56 +++++++---- evalforge/utils.py | 96 +++++++++++------- forge_medical.py | 29 ++++-- forge_mini.py | 45 +++++---- tests/test_code_formatter.py | 39 +++++--- tests/test_combined_scorer.py | 46 ++++++++- tests/test_data_utils.py | 43 +++++--- tests/test_llm_evaluator.py | 29 +++--- tests/test_utils.py | 34 ++++--- 21 files changed, 548 insertions(+), 363 deletions(-) diff --git a/evalforge/__init__.py b/evalforge/__init__.py index 5ecb090..e1378c6 100644 --- a/evalforge/__init__.py +++ b/evalforge/__init__.py @@ -1,2 +1,5 @@ from evalforge.forge import EvalForge -from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics \ No newline at end of file +from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics + + +__all__ = ["EvalForge", "calculate_alignment_metrics", "format_alignment_metrics"] diff --git a/evalforge/alignment.py b/evalforge/alignment.py index 42aadbe..e45d076 100644 --- a/evalforge/alignment.py +++ b/evalforge/alignment.py @@ -168,15 +168,13 @@ def calculate_alignment_metrics( def select_best_assertions( - metrics: Dict[str, Any], - assertion_results: Dict[str, Dict[str, List[Tuple[Dict[str, Any], int]]]], - num_assertions_per_criterion: int = None, -) -> Dict[str, Dict[str, str]]: - - best_assertions = {} + criterion_assertion_results: Dict[str, Dict[str, Dict[str, Any]]], + num_assertions_per_criterion: Optional[int] = None, +) -> List[str]: + best_subset: List[str] = [] - for criterion in assertion_results.keys(): - all_assertions = list(assertion_results[criterion].keys()) + for criterion in criterion_assertion_results.keys(): + all_assertions = list(criterion_assertion_results[criterion].keys()) if not num_assertions_per_criterion: # Intelligently select the subset of assertions that maximize the criterion's alignment score @@ -192,7 +190,7 @@ def select_best_assertions( # Create subset of assertion_results subset_assertion_results = { criterion: { - assertion: assertion_results[criterion][assertion] + assertion: criterion_assertion_results[criterion][assertion] for assertion in subset } } @@ -305,7 +303,7 @@ def format_alignment_metrics(metrics, title: str = "Alignment Metrics"): criterion[:40].ljust(40), "", "", - f"{criterion_data['criterion_metrics']['alignment']:.2f}" + f"{criterion_data['criterion_metrics']['alignment']:.2f}", ) # Add rows for each assertion for assertion, assertion_data in criterion_data["per_assertion"].items(): @@ -313,7 +311,7 @@ def format_alignment_metrics(metrics, title: str = "Alignment Metrics"): "", assertion[:40].ljust(40), assertion_data["type"].ljust(9), - f"{assertion_data['alignment']:.2f}" + f"{assertion_data['alignment']:.2f}", ) # Print the table using the logger's console diff --git a/evalforge/cli.py b/evalforge/cli.py index 2c9e838..c3568c3 100644 --- a/evalforge/cli.py +++ b/evalforge/cli.py @@ -1,74 +1,49 @@ import asyncio +from 
typing import Optional import simple_parsing from simple_parsing import Serializable -from dataclasses import dataclass, Field +from dataclasses import dataclass import sys +import weave + from evalforge.forge import EvalForge from evalforge.utils import logger -from evalforge.data_utils import load_data, DataPoint +from evalforge.data_utils import load_data -train_ds_formatted = [ - DataPoint( - input_data={"text": "1+1="}, - output_data={"text": "2"}, - annotation=1, - note="Correct summation", - ), - DataPoint( - input_data={"text": "1+1="}, - output_data={"text": "3"}, - annotation=0, - note="Incorrect summation", - ), - DataPoint( - input_data={"text": "What is the square root of 16?"}, - output_data={"text": "4"}, - annotation=1, - note="Correct square root", - ), -] +MINI_DATASET_PATH = "data/mini_data.jsonl" -eval_ds_formatted = [ - DataPoint( - input_data={"text": "What is the square root of 16?"}, - output_data={"text": "4"}, - annotation=1, - note="Correct square root", - ), - DataPoint( - input_data={"text": "What is the square root of 16?"}, - output_data={"text": "3"}, - annotation=0, - note="Incorrect square root", - ), -] @dataclass class Args(Serializable): - data: str = "mini"# "Path to training data" - batch_size: int = 1 # "Batch size" - num_criteria_to_generate: int = 1 # "Number of criteria to generate" - llm_model: str = "gpt-4o" # "LLM model to use" + data: str = "mini" # "Path to training data" + batch_size: int = 1 # "Batch size" + num_criteria_to_generate: int = 1 # "Number of criteria to generate" + llm_model: str = "gpt-4o" # "LLM model to use" + weave_project: Optional[str] = None # "Weave project to use" + def forge(): logger.rule("EvalForge CLI") try: args = simple_parsing.parse(Args) - # Load the data + # Log into Weave + if args.weave_project: + weave.init(args.weave_project) + # Load the data if args.data == "mini": - logger.info(f"Running dummy data") - train_data = train_ds_formatted + logger.info("Running dummy data") + train_data = load_data(MINI_DATASET_PATH) else: logger.info(f"Loading data from {args.data}") train_data = load_data(args.data) - + forger = EvalForge( - batch_size=args.batch_size, - num_criteria_to_generate=args.num_criteria_to_generate, - llm_model=args.llm_model + batch_size=args.batch_size, + num_criteria_to_generate=args.num_criteria_to_generate, + llm_model=args.llm_model, ) # Run the fit method asyncio.run(forger.fit(train_data)) @@ -76,5 +51,6 @@ def forge(): print(f"An error occurred: {e}") sys.exit(1) + if __name__ == "__main__": - forge() \ No newline at end of file + forge() diff --git a/evalforge/code_evaluator.py b/evalforge/code_evaluator.py index 572ce3e..466ec7d 100644 --- a/evalforge/code_evaluator.py +++ b/evalforge/code_evaluator.py @@ -22,17 +22,19 @@ def score( self, model_output: Optional[Dict[str, Any]], input_data: Dict[str, Any], - **kwargs + **kwargs, ) -> Dict[str, Any]: if model_output is None: logger.error("No model output provided") - return {"code_assertion_results": { - "tests_run": 0, - "passed": 0, - "failures": 0, - "errors": 0, - "test_results": {} - }} + return { + "code_assertion_results": { + "tests_run": 0, + "passed": 0, + "failures": 0, + "errors": 0, + "test_results": {}, + } + } try: # Use the code_formatter to write assertions to files @@ -55,10 +57,14 @@ def run_tests(self, temp_dir: str, output: Any) -> str: import json # Create a test context with both the model output and input data - test_context = json.dumps({ - "output": output, - "input": output.get("input_data", {}) if 
isinstance(output, dict) else {} - }) + test_context = json.dumps( + { + "output": output, + "input": ( + output.get("input_data", {}) if isinstance(output, dict) else {} + ), + } + ) # Run the test suite using subprocess and capture the output result = subprocess.run( @@ -114,4 +120,4 @@ def parse_test_results(self, test_output: str) -> Dict[str, Any]: "failures": failures, "errors": errors, "test_results": test_result_dict, - } \ No newline at end of file + } diff --git a/evalforge/code_formatter.py b/evalforge/code_formatter.py index b3494d1..419ddc5 100644 --- a/evalforge/code_formatter.py +++ b/evalforge/code_formatter.py @@ -3,7 +3,7 @@ import os import textwrap from datetime import datetime -from typing import Dict, Optional, Set +from typing import Optional, Set import autopep8 import isort @@ -12,6 +12,7 @@ from evalforge.instructor_models import PythonAssertion + class CodeFormatter(weave.Object): @weave.op def lint_code(self, code: str) -> str: @@ -27,8 +28,8 @@ def lint_code(self, code: str) -> str: code = autopep8.fix_code(code, options={"aggressive": 2}) return code - def get_required_imports(self, tree: ast.AST) -> Set[tuple]: - required_imports = set() + def get_required_imports(self, tree: ast.AST) -> Set[tuple[Optional[str], str]]: + required_imports: Set[tuple[Optional[str], str]] = set() for node in ast.walk(tree): if isinstance(node, ast.Name): if not self.is_builtin(node.id): @@ -61,11 +62,11 @@ def write_assertions_to_files( self, assertions: list[PythonAssertion], base_dir: Optional[str] = None ) -> str: """Write assertions to test files in the specified directory. - + Args: assertions: List of PythonAssertion objects base_dir: Optional directory to write files to. If None, creates a timestamped directory - + Returns: str: Path to the base directory containing the generated files """ @@ -105,7 +106,7 @@ def create_test_file_content(self, assertion_name: str, assertion_code: str) -> dedented_assertion_code = textwrap.dedent(assertion_code).strip() # Re-indent the assertion code to match the class indentation (4 spaces) indented_assertion_code = textwrap.indent(dedented_assertion_code, " ") - + return f"""{imports} class Test_{assertion_name}(OutputTestCase): @@ -161,4 +162,4 @@ def load_tests(loader, standard_tests, pattern): # Exit with non-zero status if there were failures if not unittest.TextTestRunner().run(load_tests(None, None, None)).wasSuccessful(): sys.exit(1) -""" \ No newline at end of file +""" diff --git a/evalforge/combined_scorer.py b/evalforge/combined_scorer.py index 524b18d..c0f5a4d 100644 --- a/evalforge/combined_scorer.py +++ b/evalforge/combined_scorer.py @@ -1,13 +1,11 @@ -import asyncio from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Union import weave from pydantic import Field from evalforge.code_evaluator import CodeAssertionScorer, CodeFormatter -from evalforge.instructor_models import (Criterion, LLMAssertion, - PythonAssertion) +from evalforge.instructor_models import Criterion, LLMAssertion, PythonAssertion from evalforge.llm import DEFAULT_LLM_MODEL from evalforge.llm_evaluator import LLMAssertionScorer from evalforge.criterion_assertion_map import CriterionAssertionMap @@ -21,12 +19,6 @@ def predict_passthrough( return model_output -from typing import Any, Dict - -import weave - - - class AssertionScorer(weave.Scorer): criterion_assertion_map: CriterionAssertionMap = Field( default_factory=CriterionAssertionMap @@ -73,11 +65,13 @@ async def score( 
system_prompt=self.system_prompt, ) llm_results = await llm_scorer.score( - model_output=model_output, + model_output=model_output, input_data=input_data, task_description=self.task_description, ) - results["llm_assertion_results"] = llm_results.get("llm_assertion_results", {}) + results["llm_assertion_results"] = llm_results.get( + "llm_assertion_results", {} + ) # Process Python assertions if python_assertions: @@ -86,34 +80,36 @@ async def score( code_formatter=self.code_formatter, ) code_results = code_scorer.score( - model_output=model_output, - input_data=input_data + model_output=model_output, input_data=input_data ) - results["code_assertion_results"] = code_results.get("code_assertion_results", {}).get("test_results", {}) + results["code_assertion_results"] = code_results.get( + "code_assertion_results", {} + ).get("test_results", {}) # Map results back to criteria using the mapping class criterion_results: Dict[str, Dict[str, Any]] = {} for test_name, result in results.get("llm_assertion_results", {}).items(): - criterion = self.criterion_assertion_map.get_criterion_by_assertion(test_name) + criterion = self.criterion_assertion_map.get_criterion_by_assertion( + test_name + ) if criterion not in criterion_results: criterion_results[criterion] = {} - criterion_results[criterion][test_name] = { - "score": result, - "type": "llm" - } + criterion_results[criterion][test_name] = {"score": result, "type": "llm"} for test_name, result in results.get("code_assertion_results", {}).items(): - criterion = self.criterion_assertion_map.get_criterion_by_assertion(test_name) + criterion = self.criterion_assertion_map.get_criterion_by_assertion( + test_name + ) if criterion not in criterion_results: criterion_results[criterion] = {} criterion_results[criterion][test_name] = { "score": result["score"], - "type": "python" + "type": "python", } return criterion_results - def export(self, base_dir: str = "forged_judge"): + def export(self, base_dir: Union[str, Path] = "forged_judge") -> None: base_dir = Path(base_dir) llm_dir = base_dir / "llm_assertions" python_dir = base_dir / "python_assertions" @@ -172,7 +168,12 @@ def import_assertions(self, base_dir: str = "forged_judge"): criterion = item["criterion"] assertion = item["assertion"] self.criterion_assertion_map.add_assertion( - Criterion(criterion=criterion), assertion + Criterion( + criterion=criterion, + explanation="Imported criterion", + evaluation_method="mixed", + ), + assertion, ) def load_assertions_by_criteria(self, base_dir: Path, assertion_cls): diff --git a/evalforge/criterion_assertion_map.py b/evalforge/criterion_assertion_map.py index 097f372..99a07f2 100644 --- a/evalforge/criterion_assertion_map.py +++ b/evalforge/criterion_assertion_map.py @@ -2,8 +2,7 @@ import weave -from evalforge.instructor_models import (Criterion, LLMAssertion, - PythonAssertion) +from evalforge.instructor_models import Criterion, LLMAssertion, PythonAssertion class CriterionAssertionMap(weave.Object): @@ -24,13 +23,14 @@ def get_assertions_by_criterion( ) -> List[Union[LLMAssertion, PythonAssertion]]: return self.criterion_to_assertions.get(criterion_name, []) - def get_criterion_by_assertion(self, assertion_name: str) -> str: + def get_criterion_by_assertion(self, assertion_name: str) -> str | None: + """Get the criterion associated with a given assertion name.""" return self.assertion_to_criterion.get(assertion_name) @classmethod def from_assertions(cls, criterion_assertion_pairs): instance = cls() - for criterion, assertions in 
criterion_assertion_pairs: - for assertion in assertions: - instance.add_assertion(criterion, assertion) + for assertion in criterion_assertion_pairs: + criterion = Criterion(criterion=assertion.test_name) + instance.add_assertion(criterion, assertion) return instance diff --git a/evalforge/data_utils.py b/evalforge/data_utils.py index 824c373..3c1bcab 100644 --- a/evalforge/data_utils.py +++ b/evalforge/data_utils.py @@ -10,6 +10,7 @@ from evalforge.llm import llm_client from evalforge.instructor_models import DatasetMapping + class DataPoint(BaseModel): input_data: Dict[str, Any] = Field( description="The input data provided to the model for evaluation" @@ -22,15 +23,15 @@ class DataPoint(BaseModel): ) note: Optional[str] = Field( default=None, - description="Optional note providing additional context about the annotation" + description="Optional note providing additional context about the annotation", ) human_description: Optional[str] = Field( default=None, - description="Optional human-provided description of the task or evaluation criteria" + description="Optional human-provided description of the task or evaluation criteria", ) additional_context: Optional[str] = Field( default=None, - description="Optional field for any additional context or metadata" + description="Optional field for any additional context or metadata", ) def format(self, index: Optional[int] = None) -> str: @@ -38,18 +39,20 @@ def format(self, index: Optional[int] = None) -> str: parts = [] if index is not None: parts.append(f"Example {index}:") - - parts.extend([ - "Input:", - json.dumps(self.input_data, indent=2), - "", - "Output:", - json.dumps(self.output_data, indent=2), - "", - f"Annotation: {'Correct' if self.annotation == 1 else 'Incorrect'}", - f"Note: {self.note or 'N/A'}", - "\n" + "-" * 50 + "\n" - ]) + + parts.extend( + [ + "Input:", + json.dumps(self.input_data, indent=2), + "", + "Output:", + json.dumps(self.output_data, indent=2), + "", + f"Annotation: {'Correct' if self.annotation == 1 else 'Incorrect'}", + f"Note: {self.note or 'N/A'}", + "\n" + "-" * 50 + "\n", + ] + ) return "\n".join(parts) def to_dict(self, task_description: Optional[str] = None) -> Dict[str, Any]: @@ -58,26 +61,30 @@ def to_dict(self, task_description: Optional[str] = None) -> Dict[str, Any]: "input_data": self.input_data, "model_output": {"output": self.output_data}, "annotation": self.annotation, - "note": self.note + "note": self.note, } if task_description: result["task_description"] = task_description return result @classmethod - def format_batch(cls, - datapoints: List['DataPoint'], - finalized_task_description: str) -> str: + def format_batch( + cls, datapoints: List["DataPoint"], finalized_task_description: str + ) -> str: """Format a batch of datapoints with optional task description""" parts = [] if finalized_task_description: parts.append(f"Task Description: {finalized_task_description}\n") - + parts.extend(dp.format(i + 1) for i, dp in enumerate(datapoints)) return "\n".join(parts) @classmethod - def from_example(cls, example: Union[Dict[str, Any], List[Any]], mapping: Optional[Dict[str, str]] = None) -> Optional['DataPoint']: + def from_example( + cls, + example: Union[Dict[str, Any], List[Any]], + mapping: Optional[Dict[str, str]] = None, + ) -> Optional["DataPoint"]: """ Create a DataPoint object from a raw example dictionary or list. 
""" @@ -85,13 +92,21 @@ def from_example(cls, example: Union[Dict[str, Any], List[Any]], mapping: Option # Handle list-structured data if len(example) >= 4: return cls( - input_data={"text": example[0]["input"]} if isinstance(example[0], dict) else {"text": example[0]}, - output_data={"text": example[1]["output"]} if isinstance(example[1], dict) else {"text": example[1]}, + input_data=( + {"text": example[0]["input"]} + if isinstance(example[0], dict) + else {"text": example[0]} + ), + output_data=( + {"text": example[1]["output"]} + if isinstance(example[1], dict) + else {"text": example[1]} + ), annotation=int(example[2]), - note=example[3] + note=example[3], ) return None - + if mapping is None: try: # Attempt to directly parse the example @@ -102,25 +117,33 @@ def from_example(cls, example: Union[Dict[str, Any], List[Any]], mapping: Option return None else: # Use the mapping to create the DataPoint - input_data_key = mapping.get('input_data', None) - output_data_key = mapping.get('output_data', None) - annotation_key = mapping.get('annotation', None) - note_key = mapping.get('note', None) - human_description_key = mapping.get('human_description', None) - additional_context_key = mapping.get('additional_context', None) + input_data_key = mapping.get("input_data", None) + output_data_key = mapping.get("output_data", None) + annotation_key = mapping.get("annotation", None) + note_key = mapping.get("note", None) + human_description_key = mapping.get("human_description", None) + additional_context_key = mapping.get("additional_context", None) input_data = example.get(input_data_key, {}) if input_data_key else {} output_data = example.get(output_data_key, {}) if output_data_key else {} annotation = int(example.get(annotation_key, 0)) if annotation_key else 0 note = example.get(note_key, None) if note_key else None - human_description = example.get(human_description_key, None) if human_description_key else None - additional_context = example.get(additional_context_key, None) if additional_context_key else None + human_description = ( + example.get(human_description_key, None) + if human_description_key + else None + ) + additional_context = ( + example.get(additional_context_key, None) + if additional_context_key + else None + ) # If input_data and output_data are strings, wrap them in dictionaries if isinstance(input_data, str): - input_data = {'text': input_data} + input_data = {"text": input_data} if isinstance(output_data, str): - output_data = {'text': output_data} + output_data = {"text": output_data} return cls( input_data=input_data, @@ -139,20 +162,23 @@ def from_example(cls, example: Union[Dict[str, Any], List[Any]], mapping: Option "annotation": 1, "note": "This is a good response", "human_description": "Task involves evaluating text responses", - "additional_context": "From validation set" + "additional_context": "From validation set", } } } + @weave.op -def generate_mapping(sample: Union[Dict[str, Any], List[Any]], llm_model: str = "gpt-4") -> Dict[str, str]: +def generate_mapping( + sample: Union[Dict[str, Any], List[Any]], llm_model: str = "gpt-4" +) -> Dict[str, str]: """Generate a mapping from dataset columns to DataPoint fields using an LLM.""" logger.info(f"► Using {llm_model} to generate mapping to DataPoint fields") - + # Input validation if sample is None or (isinstance(sample, (list, dict)) and len(sample) == 0): raise ValueError("Sample cannot be None or empty") - + prompt = f""" Given the following data sample: ```json @@ -170,8 +196,11 @@ def generate_mapping(sample: 
Union[Dict[str, Any], List[Any]], llm_model: str = model=llm_model, response_model=DatasetMapping, messages=[ - {"role": "system", "content": "You are a helpful assistant that creates mappings between dataset fields and DataPoint fields."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "You are a helpful assistant that creates mappings between dataset fields and DataPoint fields.", + }, + {"role": "user", "content": prompt}, ], ) mapping = mapping_instruction.model_dump() @@ -181,22 +210,31 @@ def generate_mapping(sample: Union[Dict[str, Any], List[Any]], llm_model: str = logger.error(f"Error generating mapping: {str(e)}") raise ValueError(f"Failed to generate mapping: {str(e)}") -def load_data(data_source: Union[str, Iterable[Dict[str, Any]]], llm_model: str = "gpt-4o") -> List[DataPoint]: + +def load_data( + data_source: Union[str, Iterable[Dict[str, Any]]], llm_model: str = "gpt-4o" +) -> List[DataPoint]: """ Load data from a file path or an iterable of dictionaries and convert it into a list of DataPoint objects. Automatically map dataset columns to DataPoint fields using an LLM if necessary. """ data_points = [] samples = [] - data_source_name = data_source if isinstance(data_source, str) else type(data_source).__name__ + data_source_name = ( + data_source if isinstance(data_source, str) else type(data_source).__name__ + ) logger.rule(f"Loading data from {data_source_name}", color="blue") if isinstance(data_source, str): # Existing logic for loading data from a file path - if data_source.endswith('.json') or data_source.endswith('.jsonl'): - with open(data_source, 'r') as f: - samples = [json.loads(line) for line in f] if data_source.endswith('.jsonl') else json.load(f) - elif data_source.endswith('.csv'): - with open(data_source, newline='', encoding='utf-8') as csvfile: + if data_source.endswith(".json") or data_source.endswith(".jsonl"): + with open(data_source, "r") as f: + samples = ( + [json.loads(line) for line in f] + if data_source.endswith(".jsonl") + else json.load(f) + ) + elif data_source.endswith(".csv"): + with open(data_source, newline="", encoding="utf-8") as csvfile: reader = csv.DictReader(csvfile) samples = [row for row in reader] else: @@ -204,10 +242,10 @@ def load_data(data_source: Union[str, Iterable[Dict[str, Any]]], llm_model: str else: # New logic for handling an iterable of data samples = list(data_source) - + if not samples: raise ValueError("The dataset is empty.") - + # Try to model_validate the first sample directly first_sample = samples[0] data_point = DataPoint.from_example(first_sample) @@ -225,4 +263,4 @@ def load_data(data_source: Union[str, Iterable[Dict[str, Any]]], llm_model: str if data_point: data_points.append(data_point) logger.info(f"Loaded {len(data_points)} datapoints") - return data_points \ No newline at end of file + return data_points diff --git a/evalforge/forge.py b/evalforge/forge.py index 6a64b3c..d0abf84 100644 --- a/evalforge/forge.py +++ b/evalforge/forge.py @@ -1,20 +1,24 @@ -import asyncio from typing import Any, Dict, List, Optional, Tuple from jinja2 import Template import random import weave -from litellm import acompletion from evalforge.combined_scorer import AssertionScorer from evalforge.criterion_assertion_map import CriterionAssertionMap -from evalforge.alignment import (calculate_alignment_metrics, - filter_assertion_results, - format_alignment_metrics, - select_best_assertions, - select_best_criteria) -from evalforge.instructor_models import (CombinedTaskDescription, Criterion, - 
CriterionAssertions, - EvaluationCriteria, TaskDescription) +from evalforge.alignment import ( + calculate_alignment_metrics, + filter_assertion_results, + format_alignment_metrics, + select_best_assertions, + select_best_criteria, +) +from evalforge.instructor_models import ( + CombinedTaskDescription, + Criterion, + CriterionAssertions, + EvaluationCriteria, + TaskDescription, +) from evalforge.llm import llm_aclient, DEFAULT_LLM_MODEL from evalforge.prompts import ( TASK_PROMPT, @@ -51,16 +55,19 @@ class EvalForge(weave.Model, Serializable): def shuffle_and_batch_data(self, data: List[DataPoint]) -> List[List[DataPoint]]: "Shuffle and batch the data into smaller lists of datapoints" shuffled_data = random.sample(data, len(data)) - return [shuffled_data[i:i+self.batch_size] for i in range(0, len(shuffled_data), self.batch_size)] + return [ + shuffled_data[i : i + self.batch_size] + for i in range(0, len(shuffled_data), self.batch_size) + ] def format_samples(self, batch: List[DataPoint]) -> List[Dict[str, Any]]: # Helper method to format samples return [ { - 'input_data': dp.input_data, - 'output_data': dp.output_data, - 'annotation': dp.annotation, - 'note': dp.note + "input_data": dp.input_data, + "output_data": dp.output_data, + "annotation": dp.annotation, + "note": dp.note, } for dp in batch ] @@ -69,22 +76,21 @@ def format_samples(self, batch: List[DataPoint]) -> List[Dict[str, Any]]: async def get_task_description(self, data: List[DataPoint]) -> str: batched_data = self.shuffle_and_batch_data(data) task_description = "" - + for batch in tqdm(batched_data, desc="Refining task description"): samples = self.format_samples(batch) template = Template(self.task_prompt) formatted_prompt = template.render( - task_description=task_description, - samples=samples + task_description=task_description, samples=samples ) response = await llm_aclient.chat.completions.create( model=self.llm_model, messages=[ {"role": "system", "content": self.task_system_prompt}, - {"role": "user", "content": formatted_prompt} + {"role": "user", "content": formatted_prompt}, ], - response_model=TaskDescription + response_model=TaskDescription, ) task_description = response.description @@ -124,12 +130,12 @@ async def process_criteria( self, data: List[DataPoint], all_criteria: str, finalized_task_description: str ) -> EvaluationCriteria: formatted_data = DataPoint.format_batch(data, finalized_task_description) - + prompt = self.criteria_prompt.format( formatted_data=formatted_data, generated_criteria=str([c.model_dump() for c in all_criteria]), ) - + response = await llm_aclient.chat.completions.create( model=self.llm_model, messages=[ @@ -145,16 +151,21 @@ async def generate_criteria( self, data: List[DataPoint], finalized_task_description: str ) -> List[Criterion]: all_criteria = [] - + for _ in tqdm(range(self.num_criteria_to_generate), desc="Generating criteria"): - response = await self.process_criteria(data, all_criteria, finalized_task_description) + response = await self.process_criteria( + data, all_criteria, finalized_task_description + ) all_criteria.extend(response.criteria) return all_criteria @weave.op async def create_candidate_assertions( - self, data: List[DataPoint], criterion: Criterion, finalized_task_description: str + self, + data: List[DataPoint], + criterion: Criterion, + finalized_task_description: str, ) -> CriterionAssertions: formatted_data = DataPoint.format_batch(data, finalized_task_description) prompt = self.candidate_assertion_prompt.format( @@ -172,7 +183,9 @@ async def 
create_candidate_assertions( return response @weave.op - async def generate_all_assertions(self, criteria, data: List[DataPoint], finalized_task_description: str): + async def generate_all_assertions( + self, criteria, data: List[DataPoint], finalized_task_description: str + ): async def process_criterion(criterion): candidate_assertions = await self.create_candidate_assertions( data, criterion, finalized_task_description @@ -182,7 +195,7 @@ async def process_criterion(criterion): # Create list of coroutines coros = [process_criterion(criterion) for criterion in criteria] - + # Use tqdm_gather to run coroutines concurrently with progress bar results = await tqdm_gather(coros, desc="Generating assertions") @@ -191,9 +204,7 @@ async def process_criterion(criterion): @weave.op async def run_assertions( - self, - scorer: AssertionScorer, - data: List[DataPoint] + self, scorer: AssertionScorer, data: List[DataPoint] ) -> Dict[str, Dict[str, List[Tuple[int, int]]]]: criterion_assertion_results = {} @@ -249,7 +260,7 @@ async def create_and_evaluate_scorers( all_assertions: CriterionAssertionMap, train_data: List[DataPoint], criteria: List[Criterion], - finalized_task_description: str + finalized_task_description: str, ) -> Tuple[Dict, Dict]: """Creates and evaluates both initial and final scorers in one cohesive flow""" # Create initial scorer @@ -262,7 +273,8 @@ async def create_and_evaluate_scorers( # Run assertions and calculate initial metrics assertion_results = await self.run_assertions(initial_scorer, train_data) initial_metrics = calculate_alignment_metrics(assertion_results) - + format_alignment_metrics(initial_metrics, title="Initial alignment metrics") + if not initial_metrics: logger.warning("No metrics calculated from assertion results") return {}, {} @@ -273,7 +285,9 @@ async def create_and_evaluate_scorers( assertion_results, num_assertions_per_criterion=self.num_assertions_per_criterion, ) - filtered_assertion_results = filter_assertion_results(assertion_results, best_assertions) + filtered_assertion_results = filter_assertion_results( + assertion_results, best_assertions + ) filtered_metrics = calculate_alignment_metrics(filtered_assertion_results) if not filtered_metrics: @@ -296,7 +310,6 @@ async def create_and_evaluate_scorers( # Format metrics summaries format_alignment_metrics(filtered_metrics, title="Final alignment metrics") - format_alignment_metrics(initial_metrics, title="Initial alignment metrics") return ( { @@ -308,32 +321,36 @@ async def create_and_evaluate_scorers( "judge": initial_scorer, "alignment_metrics": initial_metrics, "assertion_results": assertion_results, - } + }, ) @weave.op async def fit(self, train_data: List[DataPoint]) -> Dict[str, Any]: logger.rule("Forging judge", color="blue") - + with logger.timer("Generating task description"): llm_task_description = await self.get_task_description(train_data) - + with logger.timer("Combining human and LLM descriptions"): finalized_task_description = await self.combine_human_and_llm_descriptions( train_data, llm_task_description ) - + with logger.timer("Generating evaluation criteria"): - criteria = await self.generate_criteria(train_data, finalized_task_description) - + criteria = await self.generate_criteria( + train_data, finalized_task_description + ) + with logger.timer("Generating assertions"): - all_assertions = await self.generate_all_assertions(criteria, train_data, finalized_task_description) - + all_assertions = await self.generate_all_assertions( + criteria, train_data, finalized_task_description + 
) + with logger.timer("Creating and evaluating scorers"): forged_judges, initial_judges = await self.create_and_evaluate_scorers( all_assertions, train_data, criteria, finalized_task_description ) - + logger.header("EvalForge pipeline completed ✨") logger.rule("Finalized task description", color="blue") logger.info(finalized_task_description) diff --git a/evalforge/instructor_models.py b/evalforge/instructor_models.py index ade3f20..f51e7bc 100644 --- a/evalforge/instructor_models.py +++ b/evalforge/instructor_models.py @@ -1,4 +1,4 @@ -from typing import List, Literal, Union, Dict +from typing import List, Literal, Union from pydantic import BaseModel, Field @@ -46,8 +46,8 @@ def __eq__(self, other): class EvaluationCriteria(BaseModel): criteria: List[Criterion] = Field( ..., - min_items=1, - max_items=2, + min_length=1, + max_length=2, description="A list of 1-2 distinct evaluation criteria, each focusing on a different aspect of output quality", ) @@ -78,8 +78,8 @@ class LLMAssertion(BaseModel): class CriterionAssertions(BaseModel): assertions: List[Union[PythonAssertion, LLMAssertion]] = Field( ..., - min_items=1, - max_items=3, + min_length=1, + max_length=3, description="Generate 1-3 specific, testable assertions that can be used to evaluate LLM outputs based on the given criterion", ) @@ -87,12 +87,24 @@ class CriterionAssertions(BaseModel): class AssertionEvaluation(BaseModel): result: Literal["PASS", "FAIL"] = Field( ..., - description="The evaluation result of an assertion. Must be either 'PASS' or 'FAIL'." + description="The evaluation result of an assertion. Must be either 'PASS' or 'FAIL'.", ) class DatasetMapping(BaseModel): - input_data: str = Field(..., description="The key in the sample data that corresponds to the input_data field in the DataPoint object") - output_data: str = Field(..., description="The key in the sample data that corresponds to the output_data field in the DataPoint object") - annotation: str = Field(..., description="The key in the sample data that corresponds to the annotation field in the DataPoint object") - note: str = Field(..., description="The key in the sample data that corresponds to the note field in the DataPoint object") + input_data: str = Field( + ..., + description="The key in the sample data that corresponds to the input_data field in the DataPoint object", + ) + output_data: str = Field( + ..., + description="The key in the sample data that corresponds to the output_data field in the DataPoint object", + ) + annotation: str = Field( + ..., + description="The key in the sample data that corresponds to the annotation field in the DataPoint object", + ) + note: str = Field( + ..., + description="The key in the sample data that corresponds to the note field in the DataPoint object", + ) diff --git a/evalforge/llm.py b/evalforge/llm.py index c932b2b..7c96026 100644 --- a/evalforge/llm.py +++ b/evalforge/llm.py @@ -5,6 +5,7 @@ from evalforge.utils import sanitize_messages + # we need this to fix litellm+weave bug def sanitize_completion(func): @wraps(func) @@ -12,13 +13,13 @@ async def async_wrapper(*args, **kwargs): if "messages" in kwargs: kwargs["messages"] = sanitize_messages(kwargs["messages"]) return await func(*args, **kwargs) - + @wraps(func) def sync_wrapper(*args, **kwargs): if "messages" in kwargs: kwargs["messages"] = sanitize_messages(kwargs["messages"]) return func(*args, **kwargs) - + return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper @@ -28,4 +29,3 @@ def sync_wrapper(*args, **kwargs): # Default model 
configurations DEFAULT_LLM_MODEL = "gpt-4o" # For high accuracy tasks DEFAULT_FAST_MODEL = "gpt-4o-mini" # For faster, lighter tasks - diff --git a/evalforge/llm_evaluator.py b/evalforge/llm_evaluator.py index 8e554da..00e9438 100644 --- a/evalforge/llm_evaluator.py +++ b/evalforge/llm_evaluator.py @@ -1,14 +1,14 @@ import asyncio -from typing import Any, Dict, List, Optional, Tuple -from pydantic import BaseModel +from typing import Any, Dict, List, Tuple import weave from pydantic import Field from evalforge.instructor_models import LLMAssertion, AssertionEvaluation -from evalforge.llm import llm_aclient, DEFAULT_LLM_MODEL +from evalforge.llm import llm_aclient from evalforge.prompts import LLMASSERTION_PROMPT_TEMPLATE, LLMASSERTION_SYSTEM_PROMPT + class LLMAssertionScorer(weave.Scorer): assertions: List[LLMAssertion] = Field(default_factory=list) model: str = Field(default="gpt-4") @@ -61,4 +61,4 @@ async def score( ] assertion_results = await asyncio.gather(*tasks) - return {"llm_assertion_results": dict(assertion_results)} \ No newline at end of file + return {"llm_assertion_results": dict(assertion_results)} diff --git a/evalforge/prompts.py b/evalforge/prompts.py index 4c8847e..c0d675d 100644 --- a/evalforge/prompts.py +++ b/evalforge/prompts.py @@ -1,7 +1,8 @@ import textwrap # Task-related prompts -TASK_PROMPT = textwrap.dedent(""" +TASK_PROMPT = textwrap.dedent( + """ Current task description: {{ task_description }} New datapoints: @@ -26,13 +27,17 @@ 3. Any formatting or style requirements 4. Evaluation criteria (based on the annotations and notes) - Keep the description concise yet comprehensive.""") + Keep the description concise yet comprehensive.""" +) -TASK_SYSTEM_PROMPT = textwrap.dedent(""" +TASK_SYSTEM_PROMPT = textwrap.dedent( + """ You are an AI assistant designed to help refine task descriptions for a given dataset. - """) + """ +) -COMBINED_TASK_PROMPT = textwrap.dedent(""" +COMBINED_TASK_PROMPT = textwrap.dedent( + """ LLM-generated task description: {llm_description} @@ -46,14 +51,18 @@ 4. The description maintains a professional tone. 5. It provides a complete picture of the task requirements and evaluation criteria. - Please provide the combined description in a single, well-structured paragraph.""") + Please provide the combined description in a single, well-structured paragraph.""" +) -COMBINED_TASK_SYSTEM_PROMPT = textwrap.dedent(""" +COMBINED_TASK_SYSTEM_PROMPT = textwrap.dedent( + """ You are an AI assistant designed to help refine task descriptions for a given dataset given a LLM-generated task description and additional human-provided context. - """) + """ +) # Criteria-related prompts -CRITERIA_PROMPT = textwrap.dedent(""" +CRITERIA_PROMPT = textwrap.dedent( + """ Analyze the following annotated datapoints: {formatted_data} @@ -76,14 +85,18 @@ [Criterion]: [Brief explanation and evaluation method] Aim for a mix of straightforward, code-evaluable criteria and more nuanced criteria that might require LLM or human evaluation. - """) + """ +) -CRITERIA_SYSTEM_PROMPT = textwrap.dedent(""" +CRITERIA_SYSTEM_PROMPT = textwrap.dedent( + """ You are an AI assistant designed to create evaluation criteria for a given task. - """) + """ +) # Assertion-related prompts -CANDIDATE_ASSERTION_PROMPT = textwrap.dedent(""" +CANDIDATE_ASSERTION_PROMPT = textwrap.dedent( + """ Given the following evaluation criterion and annotated data, generate 1-3 specific, testable assertions: Criterion: {criterion} @@ -109,14 +122,18 @@ 6. 
Aim for assertions that could be applied across multiple types of outputs Ensure that your assertions are directly evaluable and avoid vague or subjective language. Focus on creating assertions that align with human preferences and can be used to validate the quality of LLM-generated evaluations. - """) + """ +) -CANDIDATE_ASSERTION_SYSTEM_PROMPT = textwrap.dedent(""" +CANDIDATE_ASSERTION_SYSTEM_PROMPT = textwrap.dedent( + """ You are an AI assistant designed to create testable assertions for a given task and criterion. - """) + """ +) # LLM Assertion Scorer prompts -LLMASSERTION_PROMPT_TEMPLATE = textwrap.dedent(""" +LLMASSERTION_PROMPT_TEMPLATE = textwrap.dedent( + """ Task Description: {task_description} @@ -133,6 +150,7 @@ Consider the task description and input when evaluating the output against the assertion. Respond with either 'PASS' if the output meets the assertion criteria in the context of the task and input, or 'FAIL' if it does not. -""") +""" +) -LLMASSERTION_SYSTEM_PROMPT = "You are an AI assistant evaluating the quality of text outputs based on given tasks, inputs, and assertions." \ No newline at end of file +LLMASSERTION_SYSTEM_PROMPT = "You are an AI assistant evaluating the quality of text outputs based on given tasks, inputs, and assertions." diff --git a/evalforge/utils.py b/evalforge/utils.py index 5fffb3a..a013b54 100644 --- a/evalforge/utils.py +++ b/evalforge/utils.py @@ -7,7 +7,7 @@ from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn import time import functools -from typing import Callable, Any +from typing import Callable, Any, Optional import asyncio import pprint as pp from contextlib import contextmanager @@ -15,9 +15,10 @@ # Add global console instance console = Console() + def pprint(d, indent=4, width=100): """Pretty print a dictionary or other object with line width control. 
- + Args: d: Dictionary or object to print indent: Number of spaces for indentation @@ -27,22 +28,38 @@ def pprint(d, indent=4, width=100): printer = pp.PrettyPrinter(indent=indent, width=width) printer.pprint(d) + def load_jsonl(filename: Path | str) -> list[dict]: """Load a JSONL file into a list of dictionaries.""" with open(filename, "r") as file: return [json.loads(line) for line in file] - + + class BaseModelEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, BaseModel): return obj.model_dump() return super().default(obj) + class NumpyEncoder(json.JSONEncoder): def default(self, obj): - if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, - np.int16, np.int32, np.int64, np.uint8, - np.uint16, np.uint32, np.uint64)): + if isinstance( + obj, + ( + np.int_, + np.intc, + np.intp, + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ), + ): return int(obj) elif isinstance(obj, (np.float16, np.float32, np.float64)): return float(obj) @@ -53,7 +70,8 @@ def default(self, obj): elif isinstance(obj, BaseModel): return obj.model_dump() return super().default(obj) - + + class SuperEncoder(BaseModelEncoder, NumpyEncoder): pass @@ -65,38 +83,37 @@ def save_jsonl(data: list[dict], filename: Path | str): json.dump(example, file, cls=SuperEncoder) file.write("\n") -def listify(l: list[str]) -> str: + +def listify(listable: list[str]) -> str: """Creates a markdown list of the items in the list.""" - if not l: + if not listable: return "- None" - return "\n".join([f"- {item}" for item in l]) + return "\n".join([f"- {item}" for item in listable]) + def sanitize_messages(messages: list[dict[str, str]]) -> list[dict[str, str]]: """ Safely process messages for LiteLLM by converting all content to plain strings. This prevents issues with class attributes and non-pickleable objects. 
- + Args: messages: List of message dictionaries with 'role' and 'content' keys Returns: List of sanitized message dictionaries """ return [ - { - "role": str(msg["role"]), - "content": str(msg["content"]) - } - for msg in messages + {"role": str(msg["role"]), "content": str(msg["content"])} for msg in messages ] -async def tqdm_gather(coros, desc: str = None, total: int = None): + +async def tqdm_gather(coros, desc: Optional[str] = None, total: Optional[int] = None): """Create a Rich progress bar for gathering multiple coroutines - + Args: coros: List of coroutines to execute concurrently desc: Description for the progress bar total: Total number of steps (defaults to len(coros) if not provided) - + Returns: List of results from the gathered coroutines """ @@ -105,19 +122,19 @@ async def tqdm_gather(coros, desc: str = None, total: int = None): *Progress.get_default_columns(), TimeElapsedColumn(), console=console, - transient=True + transient=True, ) - + if total is None: total = len(coros) - + task_id = progress.add_task(f"[bold blue]{desc}", total=total) - + async def wrapped_coro(coro): result = await coro progress.update(task_id, advance=1) return result - + progress.start() try: results = await asyncio.gather(*[wrapped_coro(coro) for coro in coros]) @@ -125,6 +142,7 @@ async def wrapped_coro(coro): finally: progress.stop() + # Keep the original tqdm for synchronous operations def tqdm(iterable=None, desc: str = None, total: int = None): """Create a Rich progress bar for synchronous operations""" @@ -133,12 +151,12 @@ def tqdm(iterable=None, desc: str = None, total: int = None): *Progress.get_default_columns(), TimeElapsedColumn(), console=console, - transient=True + transient=True, ) # Use provided total or calculate from coroutines if total is None: total = len(iterable) - + task_id = progress.add_task(f"[bold blue]{desc}", total=total) progress.start() try: @@ -148,16 +166,18 @@ def tqdm(iterable=None, desc: str = None, total: int = None): finally: progress.stop() + def timer(func: Callable) -> Callable: """Decorator that measures and prints execution time of functions. Works with both async and regular functions. 
- + Args: func: The function to be timed - + Returns: Wrapped function that prints its execution time """ + @functools.wraps(func) async def async_wrapper(*args, **kwargs) -> Any: start = time.perf_counter() @@ -165,7 +185,7 @@ async def async_wrapper(*args, **kwargs) -> Any: elapsed = time.perf_counter() - start console.print(f"[dim](time: {elapsed:.2f}s)[/]") return result - + @functools.wraps(func) def sync_wrapper(*args, **kwargs) -> Any: start = time.perf_counter() @@ -173,20 +193,21 @@ def sync_wrapper(*args, **kwargs) -> Any: elapsed = time.perf_counter() - start console.print(f"[dim](time: {elapsed:.2f}s)[/]") return result - + return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper + class Logger: def __init__(self): self.console = Console() def rule(self, name: str, color: str = "green") -> None: self.console.rule(f"[bold {color}]Begin {name}") - + def info(self, message: str): """Print an info message""" self.console.print(f"[green]► {message}[/]") - + def warning(self, message: str): """Print a warning message""" self.console.print(f"[yellow]► {message}[/]") @@ -194,21 +215,21 @@ def warning(self, message: str): def error(self, message: str): """Print an error message""" self.console.print(f"[red]► {message}[/]") - + def header(self, message: str): """Print a header message""" self.console.print(f"[bold blue]{message}[/]") - + @contextmanager def timer(self, message: str = None): """Context manager for timing operations - + Args: message: Optional message to print before timing """ if message: self.info(message) - + start = time.perf_counter() try: yield @@ -216,5 +237,6 @@ def timer(self, message: str = None): elapsed = time.perf_counter() - start self.console.print(f"[dim](time: {elapsed:.2f}s)[/]") + # Create global logger instance -logger = Logger() \ No newline at end of file +logger = Logger() diff --git a/forge_medical.py b/forge_medical.py index ef759ee..7a1bde2 100644 --- a/forge_medical.py +++ b/forge_medical.py @@ -1,4 +1,3 @@ - import asyncio import random from pathlib import Path @@ -7,12 +6,15 @@ from evalforge.utils import logger, pprint from evalforge.forge import EvalForge -from evalforge.data_utils import load_data +from evalforge.data_utils import load_data, DataPoint +from evalforge.combined_scorer import AssertionScorer from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics weave.init("evalforge_test_judgebench") -ds_formatted = weave.ref("weave:///a-sh0ts/medical_data_results/object/medical_data_annotations:7GcCtWgyPTWtKY48Z7v5VxwCNZXTTTpSMbmubAbyHT8").get() +ds_formatted = weave.ref( + "weave:///a-sh0ts/medical_data_results/object/medical_data_annotations:7GcCtWgyPTWtKY48Z7v5VxwCNZXTTTpSMbmubAbyHT8" +).get() data = random.sample(ds_formatted, 10) # def to_datapoint(d): @@ -33,21 +35,28 @@ NUM_CRITERIA_TO_GENERATE = 3 forger = EvalForge( - batch_size=BATCH_SIZE, - num_criteria_to_generate=NUM_CRITERIA_TO_GENERATE, - llm_model=LLM_MODEL + batch_size=BATCH_SIZE, + num_criteria_to_generate=NUM_CRITERIA_TO_GENERATE, + llm_model=LLM_MODEL, ) results = asyncio.run(forger.fit(formatted_data)) forged_judge = results["forged_judges"]["judge"] +logger.info(f"Forged judge: {forged_judge.model_dump()}") + logger.rule("Running assertions and calculating metrics", color="blue") + @weave.op -async def run_assertions_and_calculate_metrics(forger, judge, data): +async def run_assertions_and_calculate_metrics( + forger: EvalForge, judge: AssertionScorer, data: list[DataPoint] +): all_data_forged_judge_assertion_results = 
await forger.run_assertions(judge, data) - all_data_metrics = calculate_alignment_metrics(all_data_forged_judge_assertion_results) + all_data_metrics = calculate_alignment_metrics( + all_data_forged_judge_assertion_results + ) format_alignment_metrics(all_data_metrics) return -asyncio.run(run_assertions_and_calculate_metrics( - forger, forged_judge, formatted_data)) \ No newline at end of file + +asyncio.run(run_assertions_and_calculate_metrics(forger, forged_judge, formatted_data)) diff --git a/forge_mini.py b/forge_mini.py index 9b9647d..4b311d7 100644 --- a/forge_mini.py +++ b/forge_mini.py @@ -1,4 +1,3 @@ - import asyncio from evalforge.utils import logger @@ -7,41 +6,42 @@ from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics import weave + weave.init("evalforge_test_judgebench") # train_ds_formatted = [ # DataPoint( -# input_data={"text": "1+1="}, -# output_data={"text": "2"}, -# annotation=1, +# input_data={"text": "1+1="}, +# output_data={"text": "2"}, +# annotation=1, # note="Correct summation", -# ), +# ), # DataPoint( -# input_data={"text": "1+1="}, -# output_data={"text": "3"}, -# annotation=0, +# input_data={"text": "1+1="}, +# output_data={"text": "3"}, +# annotation=0, # note="Incorrect summation", # ), # DataPoint( -# input_data={"text": "What is the square root of 16?"}, -# output_data={"text": "4"}, -# annotation=1, +# input_data={"text": "What is the square root of 16?"}, +# output_data={"text": "4"}, +# annotation=1, # note="Correct square root", # ), # ] # eval_ds_formatted = [ # DataPoint( -# input_data={"text": "What is the square root of 16?"}, -# output_data={"text": "4"}, -# annotation=1, +# input_data={"text": "What is the square root of 16?"}, +# output_data={"text": "4"}, +# annotation=1, # note="Correct square root", # ), # DataPoint( -# input_data={"text": "What is the square root of 16?"}, -# output_data={"text": "3"}, -# annotation=0, +# input_data={"text": "What is the square root of 16?"}, +# output_data={"text": "3"}, +# annotation=0, # note="Incorrect square root", # ), # ] @@ -58,12 +58,17 @@ logger.rule("Running assertions and calculating metrics", color="blue") + @weave.op async def run_assertions_and_calculate_metrics(forger, judge, data): all_data_forged_judge_assertion_results = await forger.run_assertions(judge, data) - all_data_metrics = calculate_alignment_metrics(all_data_forged_judge_assertion_results) + all_data_metrics = calculate_alignment_metrics( + all_data_forged_judge_assertion_results + ) format_alignment_metrics(all_data_metrics) return -asyncio.run(run_assertions_and_calculate_metrics( - forger, forged_judge, eval_ds_formatted)) \ No newline at end of file + +asyncio.run( + run_assertions_and_calculate_metrics(forger, forged_judge, eval_ds_formatted) +) diff --git a/tests/test_code_formatter.py b/tests/test_code_formatter.py index 1e114ef..c6f3d63 100644 --- a/tests/test_code_formatter.py +++ b/tests/test_code_formatter.py @@ -5,57 +5,66 @@ from evalforge.code_formatter import CodeFormatter from evalforge.instructor_models import PythonAssertion + @pytest.fixture def code_formatter(): return CodeFormatter() + @pytest.fixture def sample_assertions(): return { - "within_word_limit": textwrap.dedent(""" + "within_word_limit": textwrap.dedent( + """ def test_within_word_limit(self): # Count words in output total_words = sum(len(str(value).split()) for value in self.output['output'].split('\\n')) self.assertLessEqual(total_words, 150, f"Output exceeds word limit with {total_words} words.") - """).strip(), 
- "essential_information_inclusion": textwrap.dedent(""" + """ + ).strip(), + "essential_information_inclusion": textwrap.dedent( + """ def test_essential_information_inclusion(self): # Check for the presence of essential keys essential_keys = ['chief complaint', 'history of present illness', 'physical examination'] output_text = self.output['output'].lower() for key in essential_keys: self.assertIn(key, output_text, f"Output is missing essential information: {key}.") - """).strip(), + """ + ).strip(), } + def test_lint_code(code_formatter): - sample_code = textwrap.dedent(""" + sample_code = textwrap.dedent( + """ def test_function(self): output_text = self.output['output'] self.assertIsInstance(output_text, str) self.assertTrue(output_text.strip( )) - """).strip() + """ + ).strip() formatted_code = code_formatter.lint_code(sample_code) # Verify no syntax errors in formatted code ast.parse(formatted_code) # Check basic formatting assert "strip()" in formatted_code # removed extra spaces + def test_write_assertions_to_files(code_formatter, sample_assertions, tmp_path): # Write assertions to files base_dir = code_formatter.write_assertions_to_files( - [PythonAssertion( - test_name=name, - code=code, - evaluation_type="python" - ) for name, code in sample_assertions.items()], - base_dir=str(tmp_path) + [ + PythonAssertion(test_name=name, code=code, evaluation_type="python") + for name, code in sample_assertions.items() + ], + base_dir=str(tmp_path), ) - + # Check if test files are created with correct content for assertion_name in sample_assertions: test_file = os.path.join(base_dir, "tests", f"test_{assertion_name}.py") assert os.path.exists(test_file) - with open(test_file, 'r') as f: + with open(test_file, "r") as f: content = f.read() - assert f"class Test_{assertion_name}(OutputTestCase):" in content \ No newline at end of file + assert f"class Test_{assertion_name}(OutputTestCase):" in content diff --git a/tests/test_combined_scorer.py b/tests/test_combined_scorer.py index 264d847..cf8d887 100644 --- a/tests/test_combined_scorer.py +++ b/tests/test_combined_scorer.py @@ -3,7 +3,7 @@ import weave from evalforge.combined_scorer import AssertionScorer, predict_passthrough from evalforge.criterion_assertion_map import CriterionAssertionMap -from evalforge.instructor_models import LLMAssertion, PythonAssertion +from evalforge.instructor_models import LLMAssertion, PythonAssertion, Criterion weave.init("combined_scorer_test") @@ -27,6 +27,7 @@ "bullet points starting with the key. Based on this assessment, respond with 'PASS' if all criteria " "are met, otherwise 'FAIL'." ), + evaluation_type="llm", ), LLMAssertion( test_name="conciseness_and_privacy_compliance", @@ -35,6 +36,7 @@ "key information effectively within 150 words while ensuring no personal identifiable information (PII) " "like name, age, gender, or ID is present? Provide your assessment as PASS for compliance or FAIL otherwise." 
), + evaluation_type="llm", ), # Python Assertions PythonAssertion( @@ -53,6 +55,7 @@ def test_essential_information_inclusion(self): for key in essential_keys: self.assertIn(key, output_text, f"Output is missing essential information: {key}.") """, + evaluation_type="python", ), PythonAssertion( test_name="no_excessive_information", @@ -63,9 +66,23 @@ def test_no_excessive_information(self): for term in disallowed_terms: self.assertNotIn(term, output_text, f"Output contains disallowed information: {term}.") """, + evaluation_type="python", ), ] +# Create criteria and map assertions to them +criterion_assertion_map = CriterionAssertionMap() +criteria = [ + Criterion(criterion="Completeness and Accuracy", evaluation_method="llm"), + Criterion(criterion="Privacy and Formatting", evaluation_method="llm"), +] + +# Map assertions to criteria +criterion_assertion_map.add_assertion(criteria[0], assertions[0]) +criterion_assertion_map.add_assertion(criteria[0], assertions[2]) +criterion_assertion_map.add_assertion(criteria[1], assertions[1]) +criterion_assertion_map.add_assertion(criteria[1], assertions[3]) + # Examples examples = [ # Example 1 @@ -164,12 +181,14 @@ def test_no_excessive_information(self): }, ] + @pytest.mark.asyncio async def test_combined_scorer(): - # Initialize the AssertionScorer with the assertions + # Initialize the AssertionScorer with the mapped assertions scorer = AssertionScorer( - assertions=CriterionAssertionMap.from_assertions(assertions), + criterion_assertion_map=criterion_assertion_map, llm_model="gpt-4o", + task_description="Transform a dialogue between a doctor and a patient into a structured medical note summary.", prompt_template=""" Task Description: {task_description} @@ -198,3 +217,24 @@ async def test_combined_scorer(): results = await evaluation.evaluate(predict_passthrough) + # Verify the structure of results + assert results is not None + assert "AssertionScorer" in results + scorer_results = results["AssertionScorer"] + + # Check for criteria presence + assert "Completeness and Accuracy" in scorer_results + assert "Privacy and Formatting" in scorer_results + + # Check the structure of each criterion's results + for criterion in ["Completeness and Accuracy", "Privacy and Formatting"]: + criterion_results = scorer_results[criterion] + assert len(criterion_results) > 0 + + for assertion_name, result in criterion_results.items(): + assert "score" in result + assert isinstance(result["score"], dict) + assert "mean" in result["score"] + + # Optionally verify that model latency is present + assert "model_latency" in results diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py index 022b443..d7f8b93 100644 --- a/tests/test_data_utils.py +++ b/tests/test_data_utils.py @@ -2,24 +2,31 @@ from evalforge.data_utils import generate_mapping from evalforge.instructor_models import DatasetMapping + @pytest.fixture def list_sample(): return [ {"input": "What is the capital of France?"}, - {"output": "Paris is the capital of France.", "score": 1, "feedback": "Good answer, accurate and concise."}, + { + "output": "Paris is the capital of France.", + "score": 1, + "feedback": "Good answer, accurate and concise.", + }, 1, - "Good answer, accurate and concise." + "Good answer, accurate and concise.", ] + @pytest.fixture def dict_sample(): return { "question": "What is the capital of France?", "response": "Paris is the capital of France.", "score": 1, - "feedback": "Good answer, accurate and concise." 
+ "feedback": "Good answer, accurate and concise.", } + @pytest.fixture def nested_dict_sample(): return { @@ -28,11 +35,12 @@ def nested_dict_sample(): "model_output": "Paris is the capital of France.", "evaluation": { "is_correct": 1, - "reviewer_notes": "Good answer, accurate and concise." - } + "reviewer_notes": "Good answer, accurate and concise.", + }, } } + def test_generate_mapping_list_structure(list_sample): mapping = generate_mapping(list_sample) assert isinstance(mapping, dict) @@ -40,35 +48,45 @@ def test_generate_mapping_list_structure(list_sample): assert "output_data" in mapping assert "annotation" in mapping assert "note" in mapping - + # Test the mapping works with DataPoint.from_example from evalforge.data_utils import DataPoint + data_point = DataPoint.from_example(list_sample, mapping) assert data_point is not None assert data_point.annotation in [0, 1] + def test_generate_mapping_dict_structure(dict_sample): mapping = generate_mapping(dict_sample) assert isinstance(mapping, dict) - assert all(key in mapping for key in ["input_data", "output_data", "annotation", "note"]) - + assert all( + key in mapping for key in ["input_data", "output_data", "annotation", "note"] + ) + # Verify the mapping matches expected structure from evalforge.data_utils import DataPoint + data_point = DataPoint.from_example(dict_sample, mapping) assert data_point is not None assert data_point.annotation in [0, 1] + def test_generate_mapping_nested_structure(nested_dict_sample): mapping = generate_mapping(nested_dict_sample) assert isinstance(mapping, dict) - assert all(key in mapping for key in ["input_data", "output_data", "annotation", "note"]) - + assert all( + key in mapping for key in ["input_data", "output_data", "annotation", "note"] + ) + # Verify the mapping works with nested structures from evalforge.data_utils import DataPoint + data_point = DataPoint.from_example(nested_dict_sample, mapping) assert data_point is not None assert data_point.annotation in [0, 1] + def test_generate_mapping_invalid_input(): with pytest.raises(Exception): generate_mapping(None) @@ -77,13 +95,14 @@ def test_generate_mapping_invalid_input(): with pytest.raises(Exception): generate_mapping({}) + def test_mapping_model_validation(): # Test that the mapping follows DatasetMapping model sample_mapping = { "input_data": "question", "output_data": "response", "annotation": "score", - "note": "feedback" + "note": "feedback", } mapping_model = DatasetMapping(**sample_mapping) - assert mapping_model.model_dump() == sample_mapping \ No newline at end of file + assert mapping_model.model_dump() == sample_mapping diff --git a/tests/test_llm_evaluator.py b/tests/test_llm_evaluator.py index 7d69c1e..0b62760 100644 --- a/tests/test_llm_evaluator.py +++ b/tests/test_llm_evaluator.py @@ -2,6 +2,7 @@ from evalforge.llm_evaluator import LLMAssertionScorer from evalforge.instructor_models import LLMAssertion + @pytest.fixture def assertions(): return [ @@ -33,10 +34,12 @@ def assertions(): ), ] + @pytest.fixture def task_description(): return "Transform a dialogue between a doctor and a patient into a structured medical note summary, adhering to privacy guidelines and specified formatting instructions." 
+ @pytest.fixture def input_data(): return { @@ -51,6 +54,7 @@ def input_data(): ) } + @pytest.fixture def model_output(): return { @@ -65,18 +69,21 @@ def model_output(): ) } + @pytest.mark.asyncio -async def test_llm_assertion_scorer(assertions, task_description, input_data, model_output): +async def test_llm_assertion_scorer( + assertions, task_description, input_data, model_output +): scorer = LLMAssertionScorer(assertions=assertions) - results = await scorer.score(model_output, task_description, input_data) - + results = await scorer.score( + model_output=model_output, + input_data=input_data, + task_description=task_description, + ) + assert "llm_assertion_results" in results assert len(results["llm_assertion_results"]) == len(assertions) - - for test_name, result in results["llm_assertion_results"].items(): - assert "score" in result - assert "result" in result - assert "type" in result - assert result["type"] == "llm" - assert result["score"] in [0, 1] - assert result["result"] in ["PASS", "FAIL"] \ No newline at end of file + + for test_name, score in results["llm_assertion_results"].items(): + assert isinstance(score, int) + assert score in [0, 1] diff --git a/tests/test_utils.py b/tests/test_utils.py index 694c2bc..96fddba 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,80 +12,84 @@ SuperEncoder, ) + # Test data class TestModel(BaseModel): name: str value: int + @pytest.fixture def temp_jsonl(tmp_path): file_path = tmp_path / "test.jsonl" - test_data = [ - {"a": 1, "b": 2}, - {"c": 3, "d": 4} - ] + test_data = [{"a": 1, "b": 2}, {"c": 3, "d": 4}] with open(file_path, "w") as f: for item in test_data: f.write(json.dumps(item) + "\n") return file_path, test_data + def test_load_jsonl(temp_jsonl): file_path, expected_data = temp_jsonl loaded_data = load_jsonl(file_path) assert loaded_data == expected_data + def test_save_jsonl(tmp_path): file_path = tmp_path / "output.jsonl" test_data = [ {"normal": "data"}, {"numpy": np.int64(42)}, {"array": np.array([1, 2, 3])}, - {"pydantic": TestModel(name="test", value=1)} + {"pydantic": TestModel(name="test", value=1)}, ] - + save_jsonl(test_data, file_path) loaded_data = load_jsonl(file_path) - + assert loaded_data[0] == {"normal": "data"} assert loaded_data[1] == {"numpy": 42} assert loaded_data[2] == {"array": [1, 2, 3]} assert loaded_data[3] == {"pydantic": {"name": "test", "value": 1}} + def test_listify(): # Test empty list assert listify([]) == "- None" - + # Test normal list items = ["apple", "banana", "orange"] expected = "- apple\n- banana\n- orange" assert listify(items) == expected + def test_super_encoder(): encoder = SuperEncoder() - + # Test numpy types assert pytest.approx(encoder.default(np.int64(42))) == 42 assert pytest.approx(encoder.default(np.float32(3.14))) == 3.14 assert encoder.default(np.bool_(True)) == True assert pytest.approx(encoder.default(np.array([1, 2, 3]))) == [1, 2, 3] - + # Test pydantic model model = TestModel(name="test", value=1) assert encoder.default(model) == {"name": "test", "value": 1} - + # Test unsupported type with pytest.raises(TypeError): encoder.default(set()) + def test_pprint(capsys): # Test dictionary printing test_dict = {"a": 1, "b": 2} pprint(test_dict) captured = capsys.readouterr() - assert json.loads(captured.out) == test_dict - - # Test non-dictionary printing + assert eval(captured.out.strip()) == test_dict + + # Test string printing test_str = "Hello, World!" 
pprint(test_str) captured = capsys.readouterr() - assert captured.out.strip() == test_str \ No newline at end of file + assert captured.out.strip() == f"'{test_str}'"