Commit

lint
tcapelle committed Nov 8, 2024
1 parent d1e6208 commit 60c1c44
Showing 21 changed files with 548 additions and 363 deletions.
5 changes: 4 additions & 1 deletion evalforge/__init__.py
@@ -1,2 +1,5 @@
from evalforge.forge import EvalForge
from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics
from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics


__all__ = ["EvalForge", "calculate_alignment_metrics", "format_alignment_metrics"]
20 changes: 9 additions & 11 deletions evalforge/alignment.py
@@ -168,15 +168,13 @@ def calculate_alignment_metrics(


def select_best_assertions(
metrics: Dict[str, Any],
assertion_results: Dict[str, Dict[str, List[Tuple[Dict[str, Any], int]]]],
num_assertions_per_criterion: int = None,
) -> Dict[str, Dict[str, str]]:

best_assertions = {}
criterion_assertion_results: Dict[str, Dict[str, Dict[str, Any]]],
num_assertions_per_criterion: Optional[int] = None,
) -> List[str]:
best_subset: List[str] = []

for criterion in assertion_results.keys():
all_assertions = list(assertion_results[criterion].keys())
for criterion in criterion_assertion_results.keys():
all_assertions = list(criterion_assertion_results[criterion].keys())

if not num_assertions_per_criterion:
# Intelligently select the subset of assertions that maximize the criterion's alignment score
@@ -192,7 +190,7 @@ def select_best_assertions(
# Create subset of assertion_results
subset_assertion_results = {
criterion: {
assertion: assertion_results[criterion][assertion]
assertion: criterion_assertion_results[criterion][assertion]
for assertion in subset
}
}
@@ -305,15 +303,15 @@ def format_alignment_metrics(metrics, title: str = "Alignment Metrics"):
criterion[:40].ljust(40),
"",
"",
f"{criterion_data['criterion_metrics']['alignment']:.2f}"
f"{criterion_data['criterion_metrics']['alignment']:.2f}",
)
# Add rows for each assertion
for assertion, assertion_data in criterion_data["per_assertion"].items():
table.add_row(
"",
assertion[:40].ljust(40),
assertion_data["type"].ljust(9),
f"{assertion_data['alignment']:.2f}"
f"{assertion_data['alignment']:.2f}",
)

# Print the table using the logger's console
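
For context on the dict that format_alignment_metrics walks, here is a self-contained sketch built only from the keys visible in this hunk ('criterion_metrics' -> 'alignment' and 'per_assertion' -> {'type', 'alignment'}); the top-level keying by criterion name and all concrete values are assumptions for illustration:

# Assumed metrics layout; a plain-print stand-in for the rich-table rows added above.
metrics = {
    "Answer is numerically correct": {
        "criterion_metrics": {"alignment": 0.83},
        "per_assertion": {
            "test_sum_is_correct": {"type": "python", "alignment": 0.91},
            "llm_judge_correctness": {"type": "llm", "alignment": 0.74},
        },
    },
}

for criterion, criterion_data in metrics.items():
    print(criterion[:40].ljust(40), f"{criterion_data['criterion_metrics']['alignment']:.2f}")
    for assertion, assertion_data in criterion_data["per_assertion"].items():
        print("    ", assertion[:40].ljust(40), assertion_data["type"].ljust(9),
              f"{assertion_data['alignment']:.2f}")
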
72 changes: 24 additions & 48 deletions evalforge/cli.py
@@ -1,80 +1,56 @@
import asyncio
from typing import Optional
import simple_parsing
from simple_parsing import Serializable
from dataclasses import dataclass, Field
from dataclasses import dataclass
import sys

import weave

from evalforge.forge import EvalForge
from evalforge.utils import logger
from evalforge.data_utils import load_data, DataPoint
from evalforge.data_utils import load_data

train_ds_formatted = [
DataPoint(
input_data={"text": "1+1="},
output_data={"text": "2"},
annotation=1,
note="Correct summation",
),
DataPoint(
input_data={"text": "1+1="},
output_data={"text": "3"},
annotation=0,
note="Incorrect summation",
),
DataPoint(
input_data={"text": "What is the square root of 16?"},
output_data={"text": "4"},
annotation=1,
note="Correct square root",
),
]
MINI_DATASET_PATH = "data/mini_data.jsonl"

eval_ds_formatted = [
DataPoint(
input_data={"text": "What is the square root of 16?"},
output_data={"text": "4"},
annotation=1,
note="Correct square root",
),
DataPoint(
input_data={"text": "What is the square root of 16?"},
output_data={"text": "3"},
annotation=0,
note="Incorrect square root",
),
]

@dataclass
class Args(Serializable):
data: str = "mini"# "Path to training data"
batch_size: int = 1 # "Batch size"
num_criteria_to_generate: int = 1 # "Number of criteria to generate"
llm_model: str = "gpt-4o" # "LLM model to use"
data: str = "mini" # "Path to training data"
batch_size: int = 1 # "Batch size"
num_criteria_to_generate: int = 1 # "Number of criteria to generate"
llm_model: str = "gpt-4o" # "LLM model to use"
weave_project: Optional[str] = None # "Weave project to use"


def forge():
logger.rule("EvalForge CLI")
try:
args = simple_parsing.parse(Args)

# Load the data
# Log into Weave
if args.weave_project:
weave.init(args.weave_project)

# Load the data
if args.data == "mini":
logger.info(f"Running dummy data")
train_data = train_ds_formatted
logger.info("Running dummy data")
train_data = load_data(MINI_DATASET_PATH)
else:
logger.info(f"Loading data from {args.data}")
train_data = load_data(args.data)

forger = EvalForge(
batch_size=args.batch_size,
num_criteria_to_generate=args.num_criteria_to_generate,
llm_model=args.llm_model
batch_size=args.batch_size,
num_criteria_to_generate=args.num_criteria_to_generate,
llm_model=args.llm_model,
)
# Run the fit method
asyncio.run(forger.fit(train_data))
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)


if __name__ == "__main__":
forge()
forge()
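
The inline DataPoint lists are replaced by data/mini_data.jsonl. Below is a sketch of how such a file could be produced from the removed examples; whether load_data expects exactly this one-JSON-object-per-line layout with these four fields is an assumption:

import json

records = [  # field names taken from the DataPoint examples removed above
    {"input_data": {"text": "1+1="}, "output_data": {"text": "2"}, "annotation": 1, "note": "Correct summation"},
    {"input_data": {"text": "1+1="}, "output_data": {"text": "3"}, "annotation": 0, "note": "Incorrect summation"},
    {"input_data": {"text": "What is the square root of 16?"}, "output_data": {"text": "4"}, "annotation": 1, "note": "Correct square root"},
]

with open("mini_data.jsonl", "w") as f:  # hypothetical local path, not the repo's data/ file
    for rec in records:
        f.write(json.dumps(rec) + "\n")
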
32 changes: 19 additions & 13 deletions evalforge/code_evaluator.py
@@ -22,17 +22,19 @@ def score(
self,
model_output: Optional[Dict[str, Any]],
input_data: Dict[str, Any],
**kwargs
**kwargs,
) -> Dict[str, Any]:
if model_output is None:
logger.error("No model output provided")
return {"code_assertion_results": {
"tests_run": 0,
"passed": 0,
"failures": 0,
"errors": 0,
"test_results": {}
}}
return {
"code_assertion_results": {
"tests_run": 0,
"passed": 0,
"failures": 0,
"errors": 0,
"test_results": {},
}
}

try:
# Use the code_formatter to write assertions to files
@@ -55,10 +57,14 @@ def run_tests(self, temp_dir: str, output: Any) -> str:
import json

# Create a test context with both the model output and input data
test_context = json.dumps({
"output": output,
"input": output.get("input_data", {}) if isinstance(output, dict) else {}
})
test_context = json.dumps(
{
"output": output,
"input": (
output.get("input_data", {}) if isinstance(output, dict) else {}
),
}
)

# Run the test suite using subprocess and capture the output
result = subprocess.run(
@@ -114,4 +120,4 @@ def parse_test_results(self, test_output: str) -> Dict[str, Any]:
"failures": failures,
"errors": errors,
"test_results": test_result_dict,
}
}
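
run_tests above serializes a JSON test context and shells out to the generated unittest suite. A standalone sketch of that pattern follows; passing the context through an environment variable (TEST_CONTEXT) and discovering tests in a temp directory are assumptions, since the actual handoff is not shown in this hunk:

import json
import os
import subprocess
import sys

test_context = json.dumps(
    {"output": {"text": "4"}, "input": {"text": "What is the square root of 16?"}}
)
env = {**os.environ, "TEST_CONTEXT": test_context}  # hypothetical variable name

result = subprocess.run(
    [sys.executable, "-m", "unittest", "discover", "-s", "generated_tests"],  # hypothetical temp dir
    capture_output=True,
    text=True,
    env=env,
)
print(result.stderr or result.stdout)  # unittest writes its report to stderr
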
15 changes: 8 additions & 7 deletions evalforge/code_formatter.py
@@ -3,7 +3,7 @@
import os
import textwrap
from datetime import datetime
from typing import Dict, Optional, Set
from typing import Optional, Set

import autopep8
import isort
@@ -12,6 +12,7 @@

from evalforge.instructor_models import PythonAssertion


class CodeFormatter(weave.Object):
@weave.op
def lint_code(self, code: str) -> str:
@@ -27,8 +28,8 @@ def lint_code(self, code: str) -> str:
code = autopep8.fix_code(code, options={"aggressive": 2})
return code

def get_required_imports(self, tree: ast.AST) -> Set[tuple]:
required_imports = set()
def get_required_imports(self, tree: ast.AST) -> Set[tuple[Optional[str], str]]:
required_imports: Set[tuple[Optional[str], str]] = set()
for node in ast.walk(tree):
if isinstance(node, ast.Name):
if not self.is_builtin(node.id):
@@ -61,11 +62,11 @@ def write_assertions_to_files(
self, assertions: list[PythonAssertion], base_dir: Optional[str] = None
) -> str:
"""Write assertions to test files in the specified directory.
Args:
assertions: List of PythonAssertion objects
base_dir: Optional directory to write files to. If None, creates a timestamped directory
Returns:
str: Path to the base directory containing the generated files
"""
@@ -105,7 +106,7 @@ def create_test_file_content(self, assertion_name: str, assertion_code: str) ->
dedented_assertion_code = textwrap.dedent(assertion_code).strip()
# Re-indent the assertion code to match the class indentation (4 spaces)
indented_assertion_code = textwrap.indent(dedented_assertion_code, " ")

return f"""{imports}
class Test_{assertion_name}(OutputTestCase):
@@ -161,4 +162,4 @@ def load_tests(loader, standard_tests, pattern):
# Exit with non-zero status if there were failures
if not unittest.TextTestRunner().run(load_tests(None, None, None)).wasSuccessful():
sys.exit(1)
"""
"""
49 changes: 25 additions & 24 deletions evalforge/combined_scorer.py
@@ -1,13 +1,11 @@
import asyncio
from pathlib import Path
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Union

import weave
from pydantic import Field

from evalforge.code_evaluator import CodeAssertionScorer, CodeFormatter
from evalforge.instructor_models import (Criterion, LLMAssertion,
PythonAssertion)
from evalforge.instructor_models import Criterion, LLMAssertion, PythonAssertion
from evalforge.llm import DEFAULT_LLM_MODEL
from evalforge.llm_evaluator import LLMAssertionScorer
from evalforge.criterion_assertion_map import CriterionAssertionMap
@@ -21,12 +19,6 @@ def predict_passthrough(
return model_output


from typing import Any, Dict

import weave



class AssertionScorer(weave.Scorer):
criterion_assertion_map: CriterionAssertionMap = Field(
default_factory=CriterionAssertionMap
@@ -73,11 +65,13 @@ async def score(
system_prompt=self.system_prompt,
)
llm_results = await llm_scorer.score(
model_output=model_output,
model_output=model_output,
input_data=input_data,
task_description=self.task_description,
)
results["llm_assertion_results"] = llm_results.get("llm_assertion_results", {})
results["llm_assertion_results"] = llm_results.get(
"llm_assertion_results", {}
)

# Process Python assertions
if python_assertions:
@@ -86,34 +80,36 @@
code_formatter=self.code_formatter,
)
code_results = code_scorer.score(
model_output=model_output,
input_data=input_data
model_output=model_output, input_data=input_data
)
results["code_assertion_results"] = code_results.get("code_assertion_results", {}).get("test_results", {})
results["code_assertion_results"] = code_results.get(
"code_assertion_results", {}
).get("test_results", {})

# Map results back to criteria using the mapping class
criterion_results: Dict[str, Dict[str, Any]] = {}
for test_name, result in results.get("llm_assertion_results", {}).items():
criterion = self.criterion_assertion_map.get_criterion_by_assertion(test_name)
criterion = self.criterion_assertion_map.get_criterion_by_assertion(
test_name
)
if criterion not in criterion_results:
criterion_results[criterion] = {}
criterion_results[criterion][test_name] = {
"score": result,
"type": "llm"
}
criterion_results[criterion][test_name] = {"score": result, "type": "llm"}

for test_name, result in results.get("code_assertion_results", {}).items():
criterion = self.criterion_assertion_map.get_criterion_by_assertion(test_name)
criterion = self.criterion_assertion_map.get_criterion_by_assertion(
test_name
)
if criterion not in criterion_results:
criterion_results[criterion] = {}
criterion_results[criterion][test_name] = {
"score": result["score"],
"type": "python"
"type": "python",
}

return criterion_results

def export(self, base_dir: str = "forged_judge"):
def export(self, base_dir: Union[str, Path] = "forged_judge") -> None:
base_dir = Path(base_dir)
llm_dir = base_dir / "llm_assertions"
python_dir = base_dir / "python_assertions"
@@ -172,7 +168,12 @@ def import_assertions(self, base_dir: str = "forged_judge"):
criterion = item["criterion"]
assertion = item["assertion"]
self.criterion_assertion_map.add_assertion(
Criterion(criterion=criterion), assertion
Criterion(
criterion=criterion,
explanation="Imported criterion",
evaluation_method="mixed",
),
assertion,
)

def load_assertions_by_criteria(self, base_dir: Path, assertion_cls):
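
For reference, the nested structure that AssertionScorer.score returns after the remapping above is criterion name -> assertion name -> {"score", "type"}. A sketch with made-up names and values (the shape of a Python assertion's score value is not shown here and is assumed to be a plain pass/fail flag):

# Illustrative result shape only; names and scores are invented.
criterion_results = {
    "Answer is numerically correct": {
        "llm_judge_correctness": {"score": 1, "type": "llm"},
        "test_sum_is_correct": {"score": 1, "type": "python"},
    },
}

for criterion, assertions in criterion_results.items():
    for name, info in assertions.items():
        print(f"{criterion} / {name} [{info['type']}]: {info['score']}")
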