From 60c1c44d3e86aaffee8ce7df5b7f9eb160fceae2 Mon Sep 17 00:00:00 2001 From: Thomas Capelle Date: Fri, 8 Nov 2024 15:27:09 +0100 Subject: [PATCH] lint --- evalforge/__init__.py | 5 +- evalforge/alignment.py | 20 ++-- evalforge/cli.py | 72 +++++--------- evalforge/code_evaluator.py | 32 +++--- evalforge/code_formatter.py | 15 +-- evalforge/combined_scorer.py | 49 +++++----- evalforge/criterion_assertion_map.py | 12 +-- evalforge/data_utils.py | 140 +++++++++++++++++---------- evalforge/forge.py | 103 ++++++++++++-------- evalforge/instructor_models.py | 32 ++++-- evalforge/llm.py | 6 +- evalforge/llm_evaluator.py | 8 +- evalforge/prompts.py | 56 +++++++---- evalforge/utils.py | 96 +++++++++++------- forge_medical.py | 29 ++++-- forge_mini.py | 45 +++++---- tests/test_code_formatter.py | 39 +++++--- tests/test_combined_scorer.py | 46 ++++++++- tests/test_data_utils.py | 43 +++++--- tests/test_llm_evaluator.py | 29 +++--- tests/test_utils.py | 34 ++++--- 21 files changed, 548 insertions(+), 363 deletions(-) diff --git a/evalforge/__init__.py b/evalforge/__init__.py index 5ecb090..e1378c6 100644 --- a/evalforge/__init__.py +++ b/evalforge/__init__.py @@ -1,2 +1,5 @@ from evalforge.forge import EvalForge -from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics \ No newline at end of file +from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics + + +__all__ = ["EvalForge", "calculate_alignment_metrics", "format_alignment_metrics"] diff --git a/evalforge/alignment.py b/evalforge/alignment.py index 42aadbe..e45d076 100644 --- a/evalforge/alignment.py +++ b/evalforge/alignment.py @@ -168,15 +168,13 @@ def calculate_alignment_metrics( def select_best_assertions( - metrics: Dict[str, Any], - assertion_results: Dict[str, Dict[str, List[Tuple[Dict[str, Any], int]]]], - num_assertions_per_criterion: int = None, -) -> Dict[str, Dict[str, str]]: - - best_assertions = {} + criterion_assertion_results: Dict[str, Dict[str, Dict[str, Any]]], + num_assertions_per_criterion: Optional[int] = None, +) -> List[str]: + best_subset: List[str] = [] - for criterion in assertion_results.keys(): - all_assertions = list(assertion_results[criterion].keys()) + for criterion in criterion_assertion_results.keys(): + all_assertions = list(criterion_assertion_results[criterion].keys()) if not num_assertions_per_criterion: # Intelligently select the subset of assertions that maximize the criterion's alignment score @@ -192,7 +190,7 @@ def select_best_assertions( # Create subset of assertion_results subset_assertion_results = { criterion: { - assertion: assertion_results[criterion][assertion] + assertion: criterion_assertion_results[criterion][assertion] for assertion in subset } } @@ -305,7 +303,7 @@ def format_alignment_metrics(metrics, title: str = "Alignment Metrics"): criterion[:40].ljust(40), "", "", - f"{criterion_data['criterion_metrics']['alignment']:.2f}" + f"{criterion_data['criterion_metrics']['alignment']:.2f}", ) # Add rows for each assertion for assertion, assertion_data in criterion_data["per_assertion"].items(): @@ -313,7 +311,7 @@ def format_alignment_metrics(metrics, title: str = "Alignment Metrics"): "", assertion[:40].ljust(40), assertion_data["type"].ljust(9), - f"{assertion_data['alignment']:.2f}" + f"{assertion_data['alignment']:.2f}", ) # Print the table using the logger's console diff --git a/evalforge/cli.py b/evalforge/cli.py index 2c9e838..c3568c3 100644 --- a/evalforge/cli.py +++ b/evalforge/cli.py @@ -1,74 +1,49 @@ import asyncio +from 
typing import Optional import simple_parsing from simple_parsing import Serializable -from dataclasses import dataclass, Field +from dataclasses import dataclass import sys +import weave + from evalforge.forge import EvalForge from evalforge.utils import logger -from evalforge.data_utils import load_data, DataPoint +from evalforge.data_utils import load_data -train_ds_formatted = [ - DataPoint( - input_data={"text": "1+1="}, - output_data={"text": "2"}, - annotation=1, - note="Correct summation", - ), - DataPoint( - input_data={"text": "1+1="}, - output_data={"text": "3"}, - annotation=0, - note="Incorrect summation", - ), - DataPoint( - input_data={"text": "What is the square root of 16?"}, - output_data={"text": "4"}, - annotation=1, - note="Correct square root", - ), -] +MINI_DATASET_PATH = "data/mini_data.jsonl" -eval_ds_formatted = [ - DataPoint( - input_data={"text": "What is the square root of 16?"}, - output_data={"text": "4"}, - annotation=1, - note="Correct square root", - ), - DataPoint( - input_data={"text": "What is the square root of 16?"}, - output_data={"text": "3"}, - annotation=0, - note="Incorrect square root", - ), -] @dataclass class Args(Serializable): - data: str = "mini"# "Path to training data" - batch_size: int = 1 # "Batch size" - num_criteria_to_generate: int = 1 # "Number of criteria to generate" - llm_model: str = "gpt-4o" # "LLM model to use" + data: str = "mini" # "Path to training data" + batch_size: int = 1 # "Batch size" + num_criteria_to_generate: int = 1 # "Number of criteria to generate" + llm_model: str = "gpt-4o" # "LLM model to use" + weave_project: Optional[str] = None # "Weave project to use" + def forge(): logger.rule("EvalForge CLI") try: args = simple_parsing.parse(Args) - # Load the data + # Log into Weave + if args.weave_project: + weave.init(args.weave_project) + # Load the data if args.data == "mini": - logger.info(f"Running dummy data") - train_data = train_ds_formatted + logger.info("Running dummy data") + train_data = load_data(MINI_DATASET_PATH) else: logger.info(f"Loading data from {args.data}") train_data = load_data(args.data) - + forger = EvalForge( - batch_size=args.batch_size, - num_criteria_to_generate=args.num_criteria_to_generate, - llm_model=args.llm_model + batch_size=args.batch_size, + num_criteria_to_generate=args.num_criteria_to_generate, + llm_model=args.llm_model, ) # Run the fit method asyncio.run(forger.fit(train_data)) @@ -76,5 +51,6 @@ def forge(): print(f"An error occurred: {e}") sys.exit(1) + if __name__ == "__main__": - forge() \ No newline at end of file + forge() diff --git a/evalforge/code_evaluator.py b/evalforge/code_evaluator.py index 572ce3e..466ec7d 100644 --- a/evalforge/code_evaluator.py +++ b/evalforge/code_evaluator.py @@ -22,17 +22,19 @@ def score( self, model_output: Optional[Dict[str, Any]], input_data: Dict[str, Any], - **kwargs + **kwargs, ) -> Dict[str, Any]: if model_output is None: logger.error("No model output provided") - return {"code_assertion_results": { - "tests_run": 0, - "passed": 0, - "failures": 0, - "errors": 0, - "test_results": {} - }} + return { + "code_assertion_results": { + "tests_run": 0, + "passed": 0, + "failures": 0, + "errors": 0, + "test_results": {}, + } + } try: # Use the code_formatter to write assertions to files @@ -55,10 +57,14 @@ def run_tests(self, temp_dir: str, output: Any) -> str: import json # Create a test context with both the model output and input data - test_context = json.dumps({ - "output": output, - "input": output.get("input_data", {}) if 
isinstance(output, dict) else {} - }) + test_context = json.dumps( + { + "output": output, + "input": ( + output.get("input_data", {}) if isinstance(output, dict) else {} + ), + } + ) # Run the test suite using subprocess and capture the output result = subprocess.run( @@ -114,4 +120,4 @@ def parse_test_results(self, test_output: str) -> Dict[str, Any]: "failures": failures, "errors": errors, "test_results": test_result_dict, - } \ No newline at end of file + } diff --git a/evalforge/code_formatter.py b/evalforge/code_formatter.py index b3494d1..419ddc5 100644 --- a/evalforge/code_formatter.py +++ b/evalforge/code_formatter.py @@ -3,7 +3,7 @@ import os import textwrap from datetime import datetime -from typing import Dict, Optional, Set +from typing import Optional, Set import autopep8 import isort @@ -12,6 +12,7 @@ from evalforge.instructor_models import PythonAssertion + class CodeFormatter(weave.Object): @weave.op def lint_code(self, code: str) -> str: @@ -27,8 +28,8 @@ def lint_code(self, code: str) -> str: code = autopep8.fix_code(code, options={"aggressive": 2}) return code - def get_required_imports(self, tree: ast.AST) -> Set[tuple]: - required_imports = set() + def get_required_imports(self, tree: ast.AST) -> Set[tuple[Optional[str], str]]: + required_imports: Set[tuple[Optional[str], str]] = set() for node in ast.walk(tree): if isinstance(node, ast.Name): if not self.is_builtin(node.id): @@ -61,11 +62,11 @@ def write_assertions_to_files( self, assertions: list[PythonAssertion], base_dir: Optional[str] = None ) -> str: """Write assertions to test files in the specified directory. - + Args: assertions: List of PythonAssertion objects base_dir: Optional directory to write files to. If None, creates a timestamped directory - + Returns: str: Path to the base directory containing the generated files """ @@ -105,7 +106,7 @@ def create_test_file_content(self, assertion_name: str, assertion_code: str) -> dedented_assertion_code = textwrap.dedent(assertion_code).strip() # Re-indent the assertion code to match the class indentation (4 spaces) indented_assertion_code = textwrap.indent(dedented_assertion_code, " ") - + return f"""{imports} class Test_{assertion_name}(OutputTestCase): @@ -161,4 +162,4 @@ def load_tests(loader, standard_tests, pattern): # Exit with non-zero status if there were failures if not unittest.TextTestRunner().run(load_tests(None, None, None)).wasSuccessful(): sys.exit(1) -""" \ No newline at end of file +""" diff --git a/evalforge/combined_scorer.py b/evalforge/combined_scorer.py index 524b18d..c0f5a4d 100644 --- a/evalforge/combined_scorer.py +++ b/evalforge/combined_scorer.py @@ -1,13 +1,11 @@ -import asyncio from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, Union import weave from pydantic import Field from evalforge.code_evaluator import CodeAssertionScorer, CodeFormatter -from evalforge.instructor_models import (Criterion, LLMAssertion, - PythonAssertion) +from evalforge.instructor_models import Criterion, LLMAssertion, PythonAssertion from evalforge.llm import DEFAULT_LLM_MODEL from evalforge.llm_evaluator import LLMAssertionScorer from evalforge.criterion_assertion_map import CriterionAssertionMap @@ -21,12 +19,6 @@ def predict_passthrough( return model_output -from typing import Any, Dict - -import weave - - - class AssertionScorer(weave.Scorer): criterion_assertion_map: CriterionAssertionMap = Field( default_factory=CriterionAssertionMap @@ -73,11 +65,13 @@ async def score( 
system_prompt=self.system_prompt, ) llm_results = await llm_scorer.score( - model_output=model_output, + model_output=model_output, input_data=input_data, task_description=self.task_description, ) - results["llm_assertion_results"] = llm_results.get("llm_assertion_results", {}) + results["llm_assertion_results"] = llm_results.get( + "llm_assertion_results", {} + ) # Process Python assertions if python_assertions: @@ -86,34 +80,36 @@ async def score( code_formatter=self.code_formatter, ) code_results = code_scorer.score( - model_output=model_output, - input_data=input_data + model_output=model_output, input_data=input_data ) - results["code_assertion_results"] = code_results.get("code_assertion_results", {}).get("test_results", {}) + results["code_assertion_results"] = code_results.get( + "code_assertion_results", {} + ).get("test_results", {}) # Map results back to criteria using the mapping class criterion_results: Dict[str, Dict[str, Any]] = {} for test_name, result in results.get("llm_assertion_results", {}).items(): - criterion = self.criterion_assertion_map.get_criterion_by_assertion(test_name) + criterion = self.criterion_assertion_map.get_criterion_by_assertion( + test_name + ) if criterion not in criterion_results: criterion_results[criterion] = {} - criterion_results[criterion][test_name] = { - "score": result, - "type": "llm" - } + criterion_results[criterion][test_name] = {"score": result, "type": "llm"} for test_name, result in results.get("code_assertion_results", {}).items(): - criterion = self.criterion_assertion_map.get_criterion_by_assertion(test_name) + criterion = self.criterion_assertion_map.get_criterion_by_assertion( + test_name + ) if criterion not in criterion_results: criterion_results[criterion] = {} criterion_results[criterion][test_name] = { "score": result["score"], - "type": "python" + "type": "python", } return criterion_results - def export(self, base_dir: str = "forged_judge"): + def export(self, base_dir: Union[str, Path] = "forged_judge") -> None: base_dir = Path(base_dir) llm_dir = base_dir / "llm_assertions" python_dir = base_dir / "python_assertions" @@ -172,7 +168,12 @@ def import_assertions(self, base_dir: str = "forged_judge"): criterion = item["criterion"] assertion = item["assertion"] self.criterion_assertion_map.add_assertion( - Criterion(criterion=criterion), assertion + Criterion( + criterion=criterion, + explanation="Imported criterion", + evaluation_method="mixed", + ), + assertion, ) def load_assertions_by_criteria(self, base_dir: Path, assertion_cls): diff --git a/evalforge/criterion_assertion_map.py b/evalforge/criterion_assertion_map.py index 097f372..99a07f2 100644 --- a/evalforge/criterion_assertion_map.py +++ b/evalforge/criterion_assertion_map.py @@ -2,8 +2,7 @@ import weave -from evalforge.instructor_models import (Criterion, LLMAssertion, - PythonAssertion) +from evalforge.instructor_models import Criterion, LLMAssertion, PythonAssertion class CriterionAssertionMap(weave.Object): @@ -24,13 +23,14 @@ def get_assertions_by_criterion( ) -> List[Union[LLMAssertion, PythonAssertion]]: return self.criterion_to_assertions.get(criterion_name, []) - def get_criterion_by_assertion(self, assertion_name: str) -> str: + def get_criterion_by_assertion(self, assertion_name: str) -> str | None: + """Get the criterion associated with a given assertion name.""" return self.assertion_to_criterion.get(assertion_name) @classmethod def from_assertions(cls, criterion_assertion_pairs): instance = cls() - for criterion, assertions in 
criterion_assertion_pairs: - for assertion in assertions: - instance.add_assertion(criterion, assertion) + for assertion in criterion_assertion_pairs: + criterion = Criterion(criterion=assertion.test_name) + instance.add_assertion(criterion, assertion) return instance diff --git a/evalforge/data_utils.py b/evalforge/data_utils.py index 824c373..3c1bcab 100644 --- a/evalforge/data_utils.py +++ b/evalforge/data_utils.py @@ -10,6 +10,7 @@ from evalforge.llm import llm_client from evalforge.instructor_models import DatasetMapping + class DataPoint(BaseModel): input_data: Dict[str, Any] = Field( description="The input data provided to the model for evaluation" @@ -22,15 +23,15 @@ class DataPoint(BaseModel): ) note: Optional[str] = Field( default=None, - description="Optional note providing additional context about the annotation" + description="Optional note providing additional context about the annotation", ) human_description: Optional[str] = Field( default=None, - description="Optional human-provided description of the task or evaluation criteria" + description="Optional human-provided description of the task or evaluation criteria", ) additional_context: Optional[str] = Field( default=None, - description="Optional field for any additional context or metadata" + description="Optional field for any additional context or metadata", ) def format(self, index: Optional[int] = None) -> str: @@ -38,18 +39,20 @@ def format(self, index: Optional[int] = None) -> str: parts = [] if index is not None: parts.append(f"Example {index}:") - - parts.extend([ - "Input:", - json.dumps(self.input_data, indent=2), - "", - "Output:", - json.dumps(self.output_data, indent=2), - "", - f"Annotation: {'Correct' if self.annotation == 1 else 'Incorrect'}", - f"Note: {self.note or 'N/A'}", - "\n" + "-" * 50 + "\n" - ]) + + parts.extend( + [ + "Input:", + json.dumps(self.input_data, indent=2), + "", + "Output:", + json.dumps(self.output_data, indent=2), + "", + f"Annotation: {'Correct' if self.annotation == 1 else 'Incorrect'}", + f"Note: {self.note or 'N/A'}", + "\n" + "-" * 50 + "\n", + ] + ) return "\n".join(parts) def to_dict(self, task_description: Optional[str] = None) -> Dict[str, Any]: @@ -58,26 +61,30 @@ def to_dict(self, task_description: Optional[str] = None) -> Dict[str, Any]: "input_data": self.input_data, "model_output": {"output": self.output_data}, "annotation": self.annotation, - "note": self.note + "note": self.note, } if task_description: result["task_description"] = task_description return result @classmethod - def format_batch(cls, - datapoints: List['DataPoint'], - finalized_task_description: str) -> str: + def format_batch( + cls, datapoints: List["DataPoint"], finalized_task_description: str + ) -> str: """Format a batch of datapoints with optional task description""" parts = [] if finalized_task_description: parts.append(f"Task Description: {finalized_task_description}\n") - + parts.extend(dp.format(i + 1) for i, dp in enumerate(datapoints)) return "\n".join(parts) @classmethod - def from_example(cls, example: Union[Dict[str, Any], List[Any]], mapping: Optional[Dict[str, str]] = None) -> Optional['DataPoint']: + def from_example( + cls, + example: Union[Dict[str, Any], List[Any]], + mapping: Optional[Dict[str, str]] = None, + ) -> Optional["DataPoint"]: """ Create a DataPoint object from a raw example dictionary or list. 
""" @@ -85,13 +92,21 @@ def from_example(cls, example: Union[Dict[str, Any], List[Any]], mapping: Option # Handle list-structured data if len(example) >= 4: return cls( - input_data={"text": example[0]["input"]} if isinstance(example[0], dict) else {"text": example[0]}, - output_data={"text": example[1]["output"]} if isinstance(example[1], dict) else {"text": example[1]}, + input_data=( + {"text": example[0]["input"]} + if isinstance(example[0], dict) + else {"text": example[0]} + ), + output_data=( + {"text": example[1]["output"]} + if isinstance(example[1], dict) + else {"text": example[1]} + ), annotation=int(example[2]), - note=example[3] + note=example[3], ) return None - + if mapping is None: try: # Attempt to directly parse the example @@ -102,25 +117,33 @@ def from_example(cls, example: Union[Dict[str, Any], List[Any]], mapping: Option return None else: # Use the mapping to create the DataPoint - input_data_key = mapping.get('input_data', None) - output_data_key = mapping.get('output_data', None) - annotation_key = mapping.get('annotation', None) - note_key = mapping.get('note', None) - human_description_key = mapping.get('human_description', None) - additional_context_key = mapping.get('additional_context', None) + input_data_key = mapping.get("input_data", None) + output_data_key = mapping.get("output_data", None) + annotation_key = mapping.get("annotation", None) + note_key = mapping.get("note", None) + human_description_key = mapping.get("human_description", None) + additional_context_key = mapping.get("additional_context", None) input_data = example.get(input_data_key, {}) if input_data_key else {} output_data = example.get(output_data_key, {}) if output_data_key else {} annotation = int(example.get(annotation_key, 0)) if annotation_key else 0 note = example.get(note_key, None) if note_key else None - human_description = example.get(human_description_key, None) if human_description_key else None - additional_context = example.get(additional_context_key, None) if additional_context_key else None + human_description = ( + example.get(human_description_key, None) + if human_description_key + else None + ) + additional_context = ( + example.get(additional_context_key, None) + if additional_context_key + else None + ) # If input_data and output_data are strings, wrap them in dictionaries if isinstance(input_data, str): - input_data = {'text': input_data} + input_data = {"text": input_data} if isinstance(output_data, str): - output_data = {'text': output_data} + output_data = {"text": output_data} return cls( input_data=input_data, @@ -139,20 +162,23 @@ def from_example(cls, example: Union[Dict[str, Any], List[Any]], mapping: Option "annotation": 1, "note": "This is a good response", "human_description": "Task involves evaluating text responses", - "additional_context": "From validation set" + "additional_context": "From validation set", } } } + @weave.op -def generate_mapping(sample: Union[Dict[str, Any], List[Any]], llm_model: str = "gpt-4") -> Dict[str, str]: +def generate_mapping( + sample: Union[Dict[str, Any], List[Any]], llm_model: str = "gpt-4" +) -> Dict[str, str]: """Generate a mapping from dataset columns to DataPoint fields using an LLM.""" logger.info(f"► Using {llm_model} to generate mapping to DataPoint fields") - + # Input validation if sample is None or (isinstance(sample, (list, dict)) and len(sample) == 0): raise ValueError("Sample cannot be None or empty") - + prompt = f""" Given the following data sample: ```json @@ -170,8 +196,11 @@ def generate_mapping(sample: 
Union[Dict[str, Any], List[Any]], llm_model: str = model=llm_model, response_model=DatasetMapping, messages=[ - {"role": "system", "content": "You are a helpful assistant that creates mappings between dataset fields and DataPoint fields."}, - {"role": "user", "content": prompt} + { + "role": "system", + "content": "You are a helpful assistant that creates mappings between dataset fields and DataPoint fields.", + }, + {"role": "user", "content": prompt}, ], ) mapping = mapping_instruction.model_dump() @@ -181,22 +210,31 @@ def generate_mapping(sample: Union[Dict[str, Any], List[Any]], llm_model: str = logger.error(f"Error generating mapping: {str(e)}") raise ValueError(f"Failed to generate mapping: {str(e)}") -def load_data(data_source: Union[str, Iterable[Dict[str, Any]]], llm_model: str = "gpt-4o") -> List[DataPoint]: + +def load_data( + data_source: Union[str, Iterable[Dict[str, Any]]], llm_model: str = "gpt-4o" +) -> List[DataPoint]: """ Load data from a file path or an iterable of dictionaries and convert it into a list of DataPoint objects. Automatically map dataset columns to DataPoint fields using an LLM if necessary. """ data_points = [] samples = [] - data_source_name = data_source if isinstance(data_source, str) else type(data_source).__name__ + data_source_name = ( + data_source if isinstance(data_source, str) else type(data_source).__name__ + ) logger.rule(f"Loading data from {data_source_name}", color="blue") if isinstance(data_source, str): # Existing logic for loading data from a file path - if data_source.endswith('.json') or data_source.endswith('.jsonl'): - with open(data_source, 'r') as f: - samples = [json.loads(line) for line in f] if data_source.endswith('.jsonl') else json.load(f) - elif data_source.endswith('.csv'): - with open(data_source, newline='', encoding='utf-8') as csvfile: + if data_source.endswith(".json") or data_source.endswith(".jsonl"): + with open(data_source, "r") as f: + samples = ( + [json.loads(line) for line in f] + if data_source.endswith(".jsonl") + else json.load(f) + ) + elif data_source.endswith(".csv"): + with open(data_source, newline="", encoding="utf-8") as csvfile: reader = csv.DictReader(csvfile) samples = [row for row in reader] else: @@ -204,10 +242,10 @@ def load_data(data_source: Union[str, Iterable[Dict[str, Any]]], llm_model: str else: # New logic for handling an iterable of data samples = list(data_source) - + if not samples: raise ValueError("The dataset is empty.") - + # Try to model_validate the first sample directly first_sample = samples[0] data_point = DataPoint.from_example(first_sample) @@ -225,4 +263,4 @@ def load_data(data_source: Union[str, Iterable[Dict[str, Any]]], llm_model: str if data_point: data_points.append(data_point) logger.info(f"Loaded {len(data_points)} datapoints") - return data_points \ No newline at end of file + return data_points diff --git a/evalforge/forge.py b/evalforge/forge.py index 6a64b3c..d0abf84 100644 --- a/evalforge/forge.py +++ b/evalforge/forge.py @@ -1,20 +1,24 @@ -import asyncio from typing import Any, Dict, List, Optional, Tuple from jinja2 import Template import random import weave -from litellm import acompletion from evalforge.combined_scorer import AssertionScorer from evalforge.criterion_assertion_map import CriterionAssertionMap -from evalforge.alignment import (calculate_alignment_metrics, - filter_assertion_results, - format_alignment_metrics, - select_best_assertions, - select_best_criteria) -from evalforge.instructor_models import (CombinedTaskDescription, Criterion, - 
CriterionAssertions, - EvaluationCriteria, TaskDescription) +from evalforge.alignment import ( + calculate_alignment_metrics, + filter_assertion_results, + format_alignment_metrics, + select_best_assertions, + select_best_criteria, +) +from evalforge.instructor_models import ( + CombinedTaskDescription, + Criterion, + CriterionAssertions, + EvaluationCriteria, + TaskDescription, +) from evalforge.llm import llm_aclient, DEFAULT_LLM_MODEL from evalforge.prompts import ( TASK_PROMPT, @@ -51,16 +55,19 @@ class EvalForge(weave.Model, Serializable): def shuffle_and_batch_data(self, data: List[DataPoint]) -> List[List[DataPoint]]: "Shuffle and batch the data into smaller lists of datapoints" shuffled_data = random.sample(data, len(data)) - return [shuffled_data[i:i+self.batch_size] for i in range(0, len(shuffled_data), self.batch_size)] + return [ + shuffled_data[i : i + self.batch_size] + for i in range(0, len(shuffled_data), self.batch_size) + ] def format_samples(self, batch: List[DataPoint]) -> List[Dict[str, Any]]: # Helper method to format samples return [ { - 'input_data': dp.input_data, - 'output_data': dp.output_data, - 'annotation': dp.annotation, - 'note': dp.note + "input_data": dp.input_data, + "output_data": dp.output_data, + "annotation": dp.annotation, + "note": dp.note, } for dp in batch ] @@ -69,22 +76,21 @@ def format_samples(self, batch: List[DataPoint]) -> List[Dict[str, Any]]: async def get_task_description(self, data: List[DataPoint]) -> str: batched_data = self.shuffle_and_batch_data(data) task_description = "" - + for batch in tqdm(batched_data, desc="Refining task description"): samples = self.format_samples(batch) template = Template(self.task_prompt) formatted_prompt = template.render( - task_description=task_description, - samples=samples + task_description=task_description, samples=samples ) response = await llm_aclient.chat.completions.create( model=self.llm_model, messages=[ {"role": "system", "content": self.task_system_prompt}, - {"role": "user", "content": formatted_prompt} + {"role": "user", "content": formatted_prompt}, ], - response_model=TaskDescription + response_model=TaskDescription, ) task_description = response.description @@ -124,12 +130,12 @@ async def process_criteria( self, data: List[DataPoint], all_criteria: str, finalized_task_description: str ) -> EvaluationCriteria: formatted_data = DataPoint.format_batch(data, finalized_task_description) - + prompt = self.criteria_prompt.format( formatted_data=formatted_data, generated_criteria=str([c.model_dump() for c in all_criteria]), ) - + response = await llm_aclient.chat.completions.create( model=self.llm_model, messages=[ @@ -145,16 +151,21 @@ async def generate_criteria( self, data: List[DataPoint], finalized_task_description: str ) -> List[Criterion]: all_criteria = [] - + for _ in tqdm(range(self.num_criteria_to_generate), desc="Generating criteria"): - response = await self.process_criteria(data, all_criteria, finalized_task_description) + response = await self.process_criteria( + data, all_criteria, finalized_task_description + ) all_criteria.extend(response.criteria) return all_criteria @weave.op async def create_candidate_assertions( - self, data: List[DataPoint], criterion: Criterion, finalized_task_description: str + self, + data: List[DataPoint], + criterion: Criterion, + finalized_task_description: str, ) -> CriterionAssertions: formatted_data = DataPoint.format_batch(data, finalized_task_description) prompt = self.candidate_assertion_prompt.format( @@ -172,7 +183,9 @@ async def 
create_candidate_assertions( return response @weave.op - async def generate_all_assertions(self, criteria, data: List[DataPoint], finalized_task_description: str): + async def generate_all_assertions( + self, criteria, data: List[DataPoint], finalized_task_description: str + ): async def process_criterion(criterion): candidate_assertions = await self.create_candidate_assertions( data, criterion, finalized_task_description @@ -182,7 +195,7 @@ async def process_criterion(criterion): # Create list of coroutines coros = [process_criterion(criterion) for criterion in criteria] - + # Use tqdm_gather to run coroutines concurrently with progress bar results = await tqdm_gather(coros, desc="Generating assertions") @@ -191,9 +204,7 @@ async def process_criterion(criterion): @weave.op async def run_assertions( - self, - scorer: AssertionScorer, - data: List[DataPoint] + self, scorer: AssertionScorer, data: List[DataPoint] ) -> Dict[str, Dict[str, List[Tuple[int, int]]]]: criterion_assertion_results = {} @@ -249,7 +260,7 @@ async def create_and_evaluate_scorers( all_assertions: CriterionAssertionMap, train_data: List[DataPoint], criteria: List[Criterion], - finalized_task_description: str + finalized_task_description: str, ) -> Tuple[Dict, Dict]: """Creates and evaluates both initial and final scorers in one cohesive flow""" # Create initial scorer @@ -262,7 +273,8 @@ async def create_and_evaluate_scorers( # Run assertions and calculate initial metrics assertion_results = await self.run_assertions(initial_scorer, train_data) initial_metrics = calculate_alignment_metrics(assertion_results) - + format_alignment_metrics(initial_metrics, title="Initial alignment metrics") + if not initial_metrics: logger.warning("No metrics calculated from assertion results") return {}, {} @@ -273,7 +285,9 @@ async def create_and_evaluate_scorers( assertion_results, num_assertions_per_criterion=self.num_assertions_per_criterion, ) - filtered_assertion_results = filter_assertion_results(assertion_results, best_assertions) + filtered_assertion_results = filter_assertion_results( + assertion_results, best_assertions + ) filtered_metrics = calculate_alignment_metrics(filtered_assertion_results) if not filtered_metrics: @@ -296,7 +310,6 @@ async def create_and_evaluate_scorers( # Format metrics summaries format_alignment_metrics(filtered_metrics, title="Final alignment metrics") - format_alignment_metrics(initial_metrics, title="Initial alignment metrics") return ( { @@ -308,32 +321,36 @@ async def create_and_evaluate_scorers( "judge": initial_scorer, "alignment_metrics": initial_metrics, "assertion_results": assertion_results, - } + }, ) @weave.op async def fit(self, train_data: List[DataPoint]) -> Dict[str, Any]: logger.rule("Forging judge", color="blue") - + with logger.timer("Generating task description"): llm_task_description = await self.get_task_description(train_data) - + with logger.timer("Combining human and LLM descriptions"): finalized_task_description = await self.combine_human_and_llm_descriptions( train_data, llm_task_description ) - + with logger.timer("Generating evaluation criteria"): - criteria = await self.generate_criteria(train_data, finalized_task_description) - + criteria = await self.generate_criteria( + train_data, finalized_task_description + ) + with logger.timer("Generating assertions"): - all_assertions = await self.generate_all_assertions(criteria, train_data, finalized_task_description) - + all_assertions = await self.generate_all_assertions( + criteria, train_data, finalized_task_description + 
) + with logger.timer("Creating and evaluating scorers"): forged_judges, initial_judges = await self.create_and_evaluate_scorers( all_assertions, train_data, criteria, finalized_task_description ) - + logger.header("EvalForge pipeline completed ✨") logger.rule("Finalized task description", color="blue") logger.info(finalized_task_description) diff --git a/evalforge/instructor_models.py b/evalforge/instructor_models.py index ade3f20..f51e7bc 100644 --- a/evalforge/instructor_models.py +++ b/evalforge/instructor_models.py @@ -1,4 +1,4 @@ -from typing import List, Literal, Union, Dict +from typing import List, Literal, Union from pydantic import BaseModel, Field @@ -46,8 +46,8 @@ def __eq__(self, other): class EvaluationCriteria(BaseModel): criteria: List[Criterion] = Field( ..., - min_items=1, - max_items=2, + min_length=1, + max_length=2, description="A list of 1-2 distinct evaluation criteria, each focusing on a different aspect of output quality", ) @@ -78,8 +78,8 @@ class LLMAssertion(BaseModel): class CriterionAssertions(BaseModel): assertions: List[Union[PythonAssertion, LLMAssertion]] = Field( ..., - min_items=1, - max_items=3, + min_length=1, + max_length=3, description="Generate 1-3 specific, testable assertions that can be used to evaluate LLM outputs based on the given criterion", ) @@ -87,12 +87,24 @@ class CriterionAssertions(BaseModel): class AssertionEvaluation(BaseModel): result: Literal["PASS", "FAIL"] = Field( ..., - description="The evaluation result of an assertion. Must be either 'PASS' or 'FAIL'." + description="The evaluation result of an assertion. Must be either 'PASS' or 'FAIL'.", ) class DatasetMapping(BaseModel): - input_data: str = Field(..., description="The key in the sample data that corresponds to the input_data field in the DataPoint object") - output_data: str = Field(..., description="The key in the sample data that corresponds to the output_data field in the DataPoint object") - annotation: str = Field(..., description="The key in the sample data that corresponds to the annotation field in the DataPoint object") - note: str = Field(..., description="The key in the sample data that corresponds to the note field in the DataPoint object") + input_data: str = Field( + ..., + description="The key in the sample data that corresponds to the input_data field in the DataPoint object", + ) + output_data: str = Field( + ..., + description="The key in the sample data that corresponds to the output_data field in the DataPoint object", + ) + annotation: str = Field( + ..., + description="The key in the sample data that corresponds to the annotation field in the DataPoint object", + ) + note: str = Field( + ..., + description="The key in the sample data that corresponds to the note field in the DataPoint object", + ) diff --git a/evalforge/llm.py b/evalforge/llm.py index c932b2b..7c96026 100644 --- a/evalforge/llm.py +++ b/evalforge/llm.py @@ -5,6 +5,7 @@ from evalforge.utils import sanitize_messages + # we need this to fix litellm+weave bug def sanitize_completion(func): @wraps(func) @@ -12,13 +13,13 @@ async def async_wrapper(*args, **kwargs): if "messages" in kwargs: kwargs["messages"] = sanitize_messages(kwargs["messages"]) return await func(*args, **kwargs) - + @wraps(func) def sync_wrapper(*args, **kwargs): if "messages" in kwargs: kwargs["messages"] = sanitize_messages(kwargs["messages"]) return func(*args, **kwargs) - + return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper @@ -28,4 +29,3 @@ def sync_wrapper(*args, **kwargs): # Default model 
configurations DEFAULT_LLM_MODEL = "gpt-4o" # For high accuracy tasks DEFAULT_FAST_MODEL = "gpt-4o-mini" # For faster, lighter tasks - diff --git a/evalforge/llm_evaluator.py b/evalforge/llm_evaluator.py index 8e554da..00e9438 100644 --- a/evalforge/llm_evaluator.py +++ b/evalforge/llm_evaluator.py @@ -1,14 +1,14 @@ import asyncio -from typing import Any, Dict, List, Optional, Tuple -from pydantic import BaseModel +from typing import Any, Dict, List, Tuple import weave from pydantic import Field from evalforge.instructor_models import LLMAssertion, AssertionEvaluation -from evalforge.llm import llm_aclient, DEFAULT_LLM_MODEL +from evalforge.llm import llm_aclient from evalforge.prompts import LLMASSERTION_PROMPT_TEMPLATE, LLMASSERTION_SYSTEM_PROMPT + class LLMAssertionScorer(weave.Scorer): assertions: List[LLMAssertion] = Field(default_factory=list) model: str = Field(default="gpt-4") @@ -61,4 +61,4 @@ async def score( ] assertion_results = await asyncio.gather(*tasks) - return {"llm_assertion_results": dict(assertion_results)} \ No newline at end of file + return {"llm_assertion_results": dict(assertion_results)} diff --git a/evalforge/prompts.py b/evalforge/prompts.py index 4c8847e..c0d675d 100644 --- a/evalforge/prompts.py +++ b/evalforge/prompts.py @@ -1,7 +1,8 @@ import textwrap # Task-related prompts -TASK_PROMPT = textwrap.dedent(""" +TASK_PROMPT = textwrap.dedent( + """ Current task description: {{ task_description }} New datapoints: @@ -26,13 +27,17 @@ 3. Any formatting or style requirements 4. Evaluation criteria (based on the annotations and notes) - Keep the description concise yet comprehensive.""") + Keep the description concise yet comprehensive.""" +) -TASK_SYSTEM_PROMPT = textwrap.dedent(""" +TASK_SYSTEM_PROMPT = textwrap.dedent( + """ You are an AI assistant designed to help refine task descriptions for a given dataset. - """) + """ +) -COMBINED_TASK_PROMPT = textwrap.dedent(""" +COMBINED_TASK_PROMPT = textwrap.dedent( + """ LLM-generated task description: {llm_description} @@ -46,14 +51,18 @@ 4. The description maintains a professional tone. 5. It provides a complete picture of the task requirements and evaluation criteria. - Please provide the combined description in a single, well-structured paragraph.""") + Please provide the combined description in a single, well-structured paragraph.""" +) -COMBINED_TASK_SYSTEM_PROMPT = textwrap.dedent(""" +COMBINED_TASK_SYSTEM_PROMPT = textwrap.dedent( + """ You are an AI assistant designed to help refine task descriptions for a given dataset given a LLM-generated task description and additional human-provided context. - """) + """ +) # Criteria-related prompts -CRITERIA_PROMPT = textwrap.dedent(""" +CRITERIA_PROMPT = textwrap.dedent( + """ Analyze the following annotated datapoints: {formatted_data} @@ -76,14 +85,18 @@ [Criterion]: [Brief explanation and evaluation method] Aim for a mix of straightforward, code-evaluable criteria and more nuanced criteria that might require LLM or human evaluation. - """) + """ +) -CRITERIA_SYSTEM_PROMPT = textwrap.dedent(""" +CRITERIA_SYSTEM_PROMPT = textwrap.dedent( + """ You are an AI assistant designed to create evaluation criteria for a given task. - """) + """ +) # Assertion-related prompts -CANDIDATE_ASSERTION_PROMPT = textwrap.dedent(""" +CANDIDATE_ASSERTION_PROMPT = textwrap.dedent( + """ Given the following evaluation criterion and annotated data, generate 1-3 specific, testable assertions: Criterion: {criterion} @@ -109,14 +122,18 @@ 6. 
Aim for assertions that could be applied across multiple types of outputs Ensure that your assertions are directly evaluable and avoid vague or subjective language. Focus on creating assertions that align with human preferences and can be used to validate the quality of LLM-generated evaluations. - """) + """ +) -CANDIDATE_ASSERTION_SYSTEM_PROMPT = textwrap.dedent(""" +CANDIDATE_ASSERTION_SYSTEM_PROMPT = textwrap.dedent( + """ You are an AI assistant designed to create testable assertions for a given task and criterion. - """) + """ +) # LLM Assertion Scorer prompts -LLMASSERTION_PROMPT_TEMPLATE = textwrap.dedent(""" +LLMASSERTION_PROMPT_TEMPLATE = textwrap.dedent( + """ Task Description: {task_description} @@ -133,6 +150,7 @@ Consider the task description and input when evaluating the output against the assertion. Respond with either 'PASS' if the output meets the assertion criteria in the context of the task and input, or 'FAIL' if it does not. -""") +""" +) -LLMASSERTION_SYSTEM_PROMPT = "You are an AI assistant evaluating the quality of text outputs based on given tasks, inputs, and assertions." \ No newline at end of file +LLMASSERTION_SYSTEM_PROMPT = "You are an AI assistant evaluating the quality of text outputs based on given tasks, inputs, and assertions." diff --git a/evalforge/utils.py b/evalforge/utils.py index 5fffb3a..a013b54 100644 --- a/evalforge/utils.py +++ b/evalforge/utils.py @@ -7,7 +7,7 @@ from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn import time import functools -from typing import Callable, Any +from typing import Callable, Any, Optional import asyncio import pprint as pp from contextlib import contextmanager @@ -15,9 +15,10 @@ # Add global console instance console = Console() + def pprint(d, indent=4, width=100): """Pretty print a dictionary or other object with line width control. 
- + Args: d: Dictionary or object to print indent: Number of spaces for indentation @@ -27,22 +28,38 @@ def pprint(d, indent=4, width=100): printer = pp.PrettyPrinter(indent=indent, width=width) printer.pprint(d) + def load_jsonl(filename: Path | str) -> list[dict]: """Load a JSONL file into a list of dictionaries.""" with open(filename, "r") as file: return [json.loads(line) for line in file] - + + class BaseModelEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, BaseModel): return obj.model_dump() return super().default(obj) + class NumpyEncoder(json.JSONEncoder): def default(self, obj): - if isinstance(obj, (np.int_, np.intc, np.intp, np.int8, - np.int16, np.int32, np.int64, np.uint8, - np.uint16, np.uint32, np.uint64)): + if isinstance( + obj, + ( + np.int_, + np.intc, + np.intp, + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + ), + ): return int(obj) elif isinstance(obj, (np.float16, np.float32, np.float64)): return float(obj) @@ -53,7 +70,8 @@ def default(self, obj): elif isinstance(obj, BaseModel): return obj.model_dump() return super().default(obj) - + + class SuperEncoder(BaseModelEncoder, NumpyEncoder): pass @@ -65,38 +83,37 @@ def save_jsonl(data: list[dict], filename: Path | str): json.dump(example, file, cls=SuperEncoder) file.write("\n") -def listify(l: list[str]) -> str: + +def listify(listable: list[str]) -> str: """Creates a markdown list of the items in the list.""" - if not l: + if not listable: return "- None" - return "\n".join([f"- {item}" for item in l]) + return "\n".join([f"- {item}" for item in listable]) + def sanitize_messages(messages: list[dict[str, str]]) -> list[dict[str, str]]: """ Safely process messages for LiteLLM by converting all content to plain strings. This prevents issues with class attributes and non-pickleable objects. 
- + Args: messages: List of message dictionaries with 'role' and 'content' keys Returns: List of sanitized message dictionaries """ return [ - { - "role": str(msg["role"]), - "content": str(msg["content"]) - } - for msg in messages + {"role": str(msg["role"]), "content": str(msg["content"])} for msg in messages ] -async def tqdm_gather(coros, desc: str = None, total: int = None): + +async def tqdm_gather(coros, desc: Optional[str] = None, total: Optional[int] = None): """Create a Rich progress bar for gathering multiple coroutines - + Args: coros: List of coroutines to execute concurrently desc: Description for the progress bar total: Total number of steps (defaults to len(coros) if not provided) - + Returns: List of results from the gathered coroutines """ @@ -105,19 +122,19 @@ async def tqdm_gather(coros, desc: str = None, total: int = None): *Progress.get_default_columns(), TimeElapsedColumn(), console=console, - transient=True + transient=True, ) - + if total is None: total = len(coros) - + task_id = progress.add_task(f"[bold blue]{desc}", total=total) - + async def wrapped_coro(coro): result = await coro progress.update(task_id, advance=1) return result - + progress.start() try: results = await asyncio.gather(*[wrapped_coro(coro) for coro in coros]) @@ -125,6 +142,7 @@ async def wrapped_coro(coro): finally: progress.stop() + # Keep the original tqdm for synchronous operations def tqdm(iterable=None, desc: str = None, total: int = None): """Create a Rich progress bar for synchronous operations""" @@ -133,12 +151,12 @@ def tqdm(iterable=None, desc: str = None, total: int = None): *Progress.get_default_columns(), TimeElapsedColumn(), console=console, - transient=True + transient=True, ) # Use provided total or calculate from coroutines if total is None: total = len(iterable) - + task_id = progress.add_task(f"[bold blue]{desc}", total=total) progress.start() try: @@ -148,16 +166,18 @@ def tqdm(iterable=None, desc: str = None, total: int = None): finally: progress.stop() + def timer(func: Callable) -> Callable: """Decorator that measures and prints execution time of functions. Works with both async and regular functions. 
- + Args: func: The function to be timed - + Returns: Wrapped function that prints its execution time """ + @functools.wraps(func) async def async_wrapper(*args, **kwargs) -> Any: start = time.perf_counter() @@ -165,7 +185,7 @@ async def async_wrapper(*args, **kwargs) -> Any: elapsed = time.perf_counter() - start console.print(f"[dim](time: {elapsed:.2f}s)[/]") return result - + @functools.wraps(func) def sync_wrapper(*args, **kwargs) -> Any: start = time.perf_counter() @@ -173,20 +193,21 @@ def sync_wrapper(*args, **kwargs) -> Any: elapsed = time.perf_counter() - start console.print(f"[dim](time: {elapsed:.2f}s)[/]") return result - + return async_wrapper if asyncio.iscoroutinefunction(func) else sync_wrapper + class Logger: def __init__(self): self.console = Console() def rule(self, name: str, color: str = "green") -> None: self.console.rule(f"[bold {color}]Begin {name}") - + def info(self, message: str): """Print an info message""" self.console.print(f"[green]► {message}[/]") - + def warning(self, message: str): """Print a warning message""" self.console.print(f"[yellow]► {message}[/]") @@ -194,21 +215,21 @@ def warning(self, message: str): def error(self, message: str): """Print an error message""" self.console.print(f"[red]► {message}[/]") - + def header(self, message: str): """Print a header message""" self.console.print(f"[bold blue]{message}[/]") - + @contextmanager def timer(self, message: str = None): """Context manager for timing operations - + Args: message: Optional message to print before timing """ if message: self.info(message) - + start = time.perf_counter() try: yield @@ -216,5 +237,6 @@ def timer(self, message: str = None): elapsed = time.perf_counter() - start self.console.print(f"[dim](time: {elapsed:.2f}s)[/]") + # Create global logger instance -logger = Logger() \ No newline at end of file +logger = Logger() diff --git a/forge_medical.py b/forge_medical.py index ef759ee..7a1bde2 100644 --- a/forge_medical.py +++ b/forge_medical.py @@ -1,4 +1,3 @@ - import asyncio import random from pathlib import Path @@ -7,12 +6,15 @@ from evalforge.utils import logger, pprint from evalforge.forge import EvalForge -from evalforge.data_utils import load_data +from evalforge.data_utils import load_data, DataPoint +from evalforge.combined_scorer import AssertionScorer from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics weave.init("evalforge_test_judgebench") -ds_formatted = weave.ref("weave:///a-sh0ts/medical_data_results/object/medical_data_annotations:7GcCtWgyPTWtKY48Z7v5VxwCNZXTTTpSMbmubAbyHT8").get() +ds_formatted = weave.ref( + "weave:///a-sh0ts/medical_data_results/object/medical_data_annotations:7GcCtWgyPTWtKY48Z7v5VxwCNZXTTTpSMbmubAbyHT8" +).get() data = random.sample(ds_formatted, 10) # def to_datapoint(d): @@ -33,21 +35,28 @@ NUM_CRITERIA_TO_GENERATE = 3 forger = EvalForge( - batch_size=BATCH_SIZE, - num_criteria_to_generate=NUM_CRITERIA_TO_GENERATE, - llm_model=LLM_MODEL + batch_size=BATCH_SIZE, + num_criteria_to_generate=NUM_CRITERIA_TO_GENERATE, + llm_model=LLM_MODEL, ) results = asyncio.run(forger.fit(formatted_data)) forged_judge = results["forged_judges"]["judge"] +logger.info(f"Forged judge: {forged_judge.model_dump()}") + logger.rule("Running assertions and calculating metrics", color="blue") + @weave.op -async def run_assertions_and_calculate_metrics(forger, judge, data): +async def run_assertions_and_calculate_metrics( + forger: EvalForge, judge: AssertionScorer, data: list[DataPoint] +): all_data_forged_judge_assertion_results = 
await forger.run_assertions(judge, data) - all_data_metrics = calculate_alignment_metrics(all_data_forged_judge_assertion_results) + all_data_metrics = calculate_alignment_metrics( + all_data_forged_judge_assertion_results + ) format_alignment_metrics(all_data_metrics) return -asyncio.run(run_assertions_and_calculate_metrics( - forger, forged_judge, formatted_data)) \ No newline at end of file + +asyncio.run(run_assertions_and_calculate_metrics(forger, forged_judge, formatted_data)) diff --git a/forge_mini.py b/forge_mini.py index 9b9647d..4b311d7 100644 --- a/forge_mini.py +++ b/forge_mini.py @@ -1,4 +1,3 @@ - import asyncio from evalforge.utils import logger @@ -7,41 +6,42 @@ from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics import weave + weave.init("evalforge_test_judgebench") # train_ds_formatted = [ # DataPoint( -# input_data={"text": "1+1="}, -# output_data={"text": "2"}, -# annotation=1, +# input_data={"text": "1+1="}, +# output_data={"text": "2"}, +# annotation=1, # note="Correct summation", -# ), +# ), # DataPoint( -# input_data={"text": "1+1="}, -# output_data={"text": "3"}, -# annotation=0, +# input_data={"text": "1+1="}, +# output_data={"text": "3"}, +# annotation=0, # note="Incorrect summation", # ), # DataPoint( -# input_data={"text": "What is the square root of 16?"}, -# output_data={"text": "4"}, -# annotation=1, +# input_data={"text": "What is the square root of 16?"}, +# output_data={"text": "4"}, +# annotation=1, # note="Correct square root", # ), # ] # eval_ds_formatted = [ # DataPoint( -# input_data={"text": "What is the square root of 16?"}, -# output_data={"text": "4"}, -# annotation=1, +# input_data={"text": "What is the square root of 16?"}, +# output_data={"text": "4"}, +# annotation=1, # note="Correct square root", # ), # DataPoint( -# input_data={"text": "What is the square root of 16?"}, -# output_data={"text": "3"}, -# annotation=0, +# input_data={"text": "What is the square root of 16?"}, +# output_data={"text": "3"}, +# annotation=0, # note="Incorrect square root", # ), # ] @@ -58,12 +58,17 @@ logger.rule("Running assertions and calculating metrics", color="blue") + @weave.op async def run_assertions_and_calculate_metrics(forger, judge, data): all_data_forged_judge_assertion_results = await forger.run_assertions(judge, data) - all_data_metrics = calculate_alignment_metrics(all_data_forged_judge_assertion_results) + all_data_metrics = calculate_alignment_metrics( + all_data_forged_judge_assertion_results + ) format_alignment_metrics(all_data_metrics) return -asyncio.run(run_assertions_and_calculate_metrics( - forger, forged_judge, eval_ds_formatted)) \ No newline at end of file + +asyncio.run( + run_assertions_and_calculate_metrics(forger, forged_judge, eval_ds_formatted) +) diff --git a/tests/test_code_formatter.py b/tests/test_code_formatter.py index 1e114ef..c6f3d63 100644 --- a/tests/test_code_formatter.py +++ b/tests/test_code_formatter.py @@ -5,57 +5,66 @@ from evalforge.code_formatter import CodeFormatter from evalforge.instructor_models import PythonAssertion + @pytest.fixture def code_formatter(): return CodeFormatter() + @pytest.fixture def sample_assertions(): return { - "within_word_limit": textwrap.dedent(""" + "within_word_limit": textwrap.dedent( + """ def test_within_word_limit(self): # Count words in output total_words = sum(len(str(value).split()) for value in self.output['output'].split('\\n')) self.assertLessEqual(total_words, 150, f"Output exceeds word limit with {total_words} words.") - """).strip(), 
- "essential_information_inclusion": textwrap.dedent(""" + """ + ).strip(), + "essential_information_inclusion": textwrap.dedent( + """ def test_essential_information_inclusion(self): # Check for the presence of essential keys essential_keys = ['chief complaint', 'history of present illness', 'physical examination'] output_text = self.output['output'].lower() for key in essential_keys: self.assertIn(key, output_text, f"Output is missing essential information: {key}.") - """).strip(), + """ + ).strip(), } + def test_lint_code(code_formatter): - sample_code = textwrap.dedent(""" + sample_code = textwrap.dedent( + """ def test_function(self): output_text = self.output['output'] self.assertIsInstance(output_text, str) self.assertTrue(output_text.strip( )) - """).strip() + """ + ).strip() formatted_code = code_formatter.lint_code(sample_code) # Verify no syntax errors in formatted code ast.parse(formatted_code) # Check basic formatting assert "strip()" in formatted_code # removed extra spaces + def test_write_assertions_to_files(code_formatter, sample_assertions, tmp_path): # Write assertions to files base_dir = code_formatter.write_assertions_to_files( - [PythonAssertion( - test_name=name, - code=code, - evaluation_type="python" - ) for name, code in sample_assertions.items()], - base_dir=str(tmp_path) + [ + PythonAssertion(test_name=name, code=code, evaluation_type="python") + for name, code in sample_assertions.items() + ], + base_dir=str(tmp_path), ) - + # Check if test files are created with correct content for assertion_name in sample_assertions: test_file = os.path.join(base_dir, "tests", f"test_{assertion_name}.py") assert os.path.exists(test_file) - with open(test_file, 'r') as f: + with open(test_file, "r") as f: content = f.read() - assert f"class Test_{assertion_name}(OutputTestCase):" in content \ No newline at end of file + assert f"class Test_{assertion_name}(OutputTestCase):" in content diff --git a/tests/test_combined_scorer.py b/tests/test_combined_scorer.py index 264d847..cf8d887 100644 --- a/tests/test_combined_scorer.py +++ b/tests/test_combined_scorer.py @@ -3,7 +3,7 @@ import weave from evalforge.combined_scorer import AssertionScorer, predict_passthrough from evalforge.criterion_assertion_map import CriterionAssertionMap -from evalforge.instructor_models import LLMAssertion, PythonAssertion +from evalforge.instructor_models import LLMAssertion, PythonAssertion, Criterion weave.init("combined_scorer_test") @@ -27,6 +27,7 @@ "bullet points starting with the key. Based on this assessment, respond with 'PASS' if all criteria " "are met, otherwise 'FAIL'." ), + evaluation_type="llm", ), LLMAssertion( test_name="conciseness_and_privacy_compliance", @@ -35,6 +36,7 @@ "key information effectively within 150 words while ensuring no personal identifiable information (PII) " "like name, age, gender, or ID is present? Provide your assessment as PASS for compliance or FAIL otherwise." 
), + evaluation_type="llm", ), # Python Assertions PythonAssertion( @@ -53,6 +55,7 @@ def test_essential_information_inclusion(self): for key in essential_keys: self.assertIn(key, output_text, f"Output is missing essential information: {key}.") """, + evaluation_type="python", ), PythonAssertion( test_name="no_excessive_information", @@ -63,9 +66,23 @@ def test_no_excessive_information(self): for term in disallowed_terms: self.assertNotIn(term, output_text, f"Output contains disallowed information: {term}.") """, + evaluation_type="python", ), ] +# Create criteria and map assertions to them +criterion_assertion_map = CriterionAssertionMap() +criteria = [ + Criterion(criterion="Completeness and Accuracy", evaluation_method="llm"), + Criterion(criterion="Privacy and Formatting", evaluation_method="llm"), +] + +# Map assertions to criteria +criterion_assertion_map.add_assertion(criteria[0], assertions[0]) +criterion_assertion_map.add_assertion(criteria[0], assertions[2]) +criterion_assertion_map.add_assertion(criteria[1], assertions[1]) +criterion_assertion_map.add_assertion(criteria[1], assertions[3]) + # Examples examples = [ # Example 1 @@ -164,12 +181,14 @@ def test_no_excessive_information(self): }, ] + @pytest.mark.asyncio async def test_combined_scorer(): - # Initialize the AssertionScorer with the assertions + # Initialize the AssertionScorer with the mapped assertions scorer = AssertionScorer( - assertions=CriterionAssertionMap.from_assertions(assertions), + criterion_assertion_map=criterion_assertion_map, llm_model="gpt-4o", + task_description="Transform a dialogue between a doctor and a patient into a structured medical note summary.", prompt_template=""" Task Description: {task_description} @@ -198,3 +217,24 @@ async def test_combined_scorer(): results = await evaluation.evaluate(predict_passthrough) + # Verify the structure of results + assert results is not None + assert "AssertionScorer" in results + scorer_results = results["AssertionScorer"] + + # Check for criteria presence + assert "Completeness and Accuracy" in scorer_results + assert "Privacy and Formatting" in scorer_results + + # Check the structure of each criterion's results + for criterion in ["Completeness and Accuracy", "Privacy and Formatting"]: + criterion_results = scorer_results[criterion] + assert len(criterion_results) > 0 + + for assertion_name, result in criterion_results.items(): + assert "score" in result + assert isinstance(result["score"], dict) + assert "mean" in result["score"] + + # Optionally verify that model latency is present + assert "model_latency" in results diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py index 022b443..d7f8b93 100644 --- a/tests/test_data_utils.py +++ b/tests/test_data_utils.py @@ -2,24 +2,31 @@ from evalforge.data_utils import generate_mapping from evalforge.instructor_models import DatasetMapping + @pytest.fixture def list_sample(): return [ {"input": "What is the capital of France?"}, - {"output": "Paris is the capital of France.", "score": 1, "feedback": "Good answer, accurate and concise."}, + { + "output": "Paris is the capital of France.", + "score": 1, + "feedback": "Good answer, accurate and concise.", + }, 1, - "Good answer, accurate and concise." + "Good answer, accurate and concise.", ] + @pytest.fixture def dict_sample(): return { "question": "What is the capital of France?", "response": "Paris is the capital of France.", "score": 1, - "feedback": "Good answer, accurate and concise." 
+ "feedback": "Good answer, accurate and concise.", } + @pytest.fixture def nested_dict_sample(): return { @@ -28,11 +35,12 @@ def nested_dict_sample(): "model_output": "Paris is the capital of France.", "evaluation": { "is_correct": 1, - "reviewer_notes": "Good answer, accurate and concise." - } + "reviewer_notes": "Good answer, accurate and concise.", + }, } } + def test_generate_mapping_list_structure(list_sample): mapping = generate_mapping(list_sample) assert isinstance(mapping, dict) @@ -40,35 +48,45 @@ def test_generate_mapping_list_structure(list_sample): assert "output_data" in mapping assert "annotation" in mapping assert "note" in mapping - + # Test the mapping works with DataPoint.from_example from evalforge.data_utils import DataPoint + data_point = DataPoint.from_example(list_sample, mapping) assert data_point is not None assert data_point.annotation in [0, 1] + def test_generate_mapping_dict_structure(dict_sample): mapping = generate_mapping(dict_sample) assert isinstance(mapping, dict) - assert all(key in mapping for key in ["input_data", "output_data", "annotation", "note"]) - + assert all( + key in mapping for key in ["input_data", "output_data", "annotation", "note"] + ) + # Verify the mapping matches expected structure from evalforge.data_utils import DataPoint + data_point = DataPoint.from_example(dict_sample, mapping) assert data_point is not None assert data_point.annotation in [0, 1] + def test_generate_mapping_nested_structure(nested_dict_sample): mapping = generate_mapping(nested_dict_sample) assert isinstance(mapping, dict) - assert all(key in mapping for key in ["input_data", "output_data", "annotation", "note"]) - + assert all( + key in mapping for key in ["input_data", "output_data", "annotation", "note"] + ) + # Verify the mapping works with nested structures from evalforge.data_utils import DataPoint + data_point = DataPoint.from_example(nested_dict_sample, mapping) assert data_point is not None assert data_point.annotation in [0, 1] + def test_generate_mapping_invalid_input(): with pytest.raises(Exception): generate_mapping(None) @@ -77,13 +95,14 @@ def test_generate_mapping_invalid_input(): with pytest.raises(Exception): generate_mapping({}) + def test_mapping_model_validation(): # Test that the mapping follows DatasetMapping model sample_mapping = { "input_data": "question", "output_data": "response", "annotation": "score", - "note": "feedback" + "note": "feedback", } mapping_model = DatasetMapping(**sample_mapping) - assert mapping_model.model_dump() == sample_mapping \ No newline at end of file + assert mapping_model.model_dump() == sample_mapping diff --git a/tests/test_llm_evaluator.py b/tests/test_llm_evaluator.py index 7d69c1e..0b62760 100644 --- a/tests/test_llm_evaluator.py +++ b/tests/test_llm_evaluator.py @@ -2,6 +2,7 @@ from evalforge.llm_evaluator import LLMAssertionScorer from evalforge.instructor_models import LLMAssertion + @pytest.fixture def assertions(): return [ @@ -33,10 +34,12 @@ def assertions(): ), ] + @pytest.fixture def task_description(): return "Transform a dialogue between a doctor and a patient into a structured medical note summary, adhering to privacy guidelines and specified formatting instructions." 
+ @pytest.fixture def input_data(): return { @@ -51,6 +54,7 @@ def input_data(): ) } + @pytest.fixture def model_output(): return { @@ -65,18 +69,21 @@ def model_output(): ) } + @pytest.mark.asyncio -async def test_llm_assertion_scorer(assertions, task_description, input_data, model_output): +async def test_llm_assertion_scorer( + assertions, task_description, input_data, model_output +): scorer = LLMAssertionScorer(assertions=assertions) - results = await scorer.score(model_output, task_description, input_data) - + results = await scorer.score( + model_output=model_output, + input_data=input_data, + task_description=task_description, + ) + assert "llm_assertion_results" in results assert len(results["llm_assertion_results"]) == len(assertions) - - for test_name, result in results["llm_assertion_results"].items(): - assert "score" in result - assert "result" in result - assert "type" in result - assert result["type"] == "llm" - assert result["score"] in [0, 1] - assert result["result"] in ["PASS", "FAIL"] \ No newline at end of file + + for test_name, score in results["llm_assertion_results"].items(): + assert isinstance(score, int) + assert score in [0, 1] diff --git a/tests/test_utils.py b/tests/test_utils.py index 694c2bc..96fddba 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,80 +12,84 @@ SuperEncoder, ) + # Test data class TestModel(BaseModel): name: str value: int + @pytest.fixture def temp_jsonl(tmp_path): file_path = tmp_path / "test.jsonl" - test_data = [ - {"a": 1, "b": 2}, - {"c": 3, "d": 4} - ] + test_data = [{"a": 1, "b": 2}, {"c": 3, "d": 4}] with open(file_path, "w") as f: for item in test_data: f.write(json.dumps(item) + "\n") return file_path, test_data + def test_load_jsonl(temp_jsonl): file_path, expected_data = temp_jsonl loaded_data = load_jsonl(file_path) assert loaded_data == expected_data + def test_save_jsonl(tmp_path): file_path = tmp_path / "output.jsonl" test_data = [ {"normal": "data"}, {"numpy": np.int64(42)}, {"array": np.array([1, 2, 3])}, - {"pydantic": TestModel(name="test", value=1)} + {"pydantic": TestModel(name="test", value=1)}, ] - + save_jsonl(test_data, file_path) loaded_data = load_jsonl(file_path) - + assert loaded_data[0] == {"normal": "data"} assert loaded_data[1] == {"numpy": 42} assert loaded_data[2] == {"array": [1, 2, 3]} assert loaded_data[3] == {"pydantic": {"name": "test", "value": 1}} + def test_listify(): # Test empty list assert listify([]) == "- None" - + # Test normal list items = ["apple", "banana", "orange"] expected = "- apple\n- banana\n- orange" assert listify(items) == expected + def test_super_encoder(): encoder = SuperEncoder() - + # Test numpy types assert pytest.approx(encoder.default(np.int64(42))) == 42 assert pytest.approx(encoder.default(np.float32(3.14))) == 3.14 assert encoder.default(np.bool_(True)) == True assert pytest.approx(encoder.default(np.array([1, 2, 3]))) == [1, 2, 3] - + # Test pydantic model model = TestModel(name="test", value=1) assert encoder.default(model) == {"name": "test", "value": 1} - + # Test unsupported type with pytest.raises(TypeError): encoder.default(set()) + def test_pprint(capsys): # Test dictionary printing test_dict = {"a": 1, "b": 2} pprint(test_dict) captured = capsys.readouterr() - assert json.loads(captured.out) == test_dict - - # Test non-dictionary printing + assert eval(captured.out.strip()) == test_dict + + # Test string printing test_str = "Hello, World!" 
pprint(test_str) captured = capsys.readouterr() - assert captured.out.strip() == test_str \ No newline at end of file + assert captured.out.strip() == f"'{test_str}'"