Commit

lint
tcapelle committed Nov 8, 2024
1 parent d1e6208 commit 60c1c44
Showing 21 changed files with 548 additions and 363 deletions.
5 changes: 4 additions & 1 deletion evalforge/__init__.py
@@ -1,2 +1,5 @@
from evalforge.forge import EvalForge
from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics
from evalforge.alignment import calculate_alignment_metrics, format_alignment_metrics


__all__ = ["EvalForge", "calculate_alignment_metrics", "format_alignment_metrics"]
20 changes: 9 additions & 11 deletions evalforge/alignment.py
@@ -168,15 +168,13 @@ def calculate_alignment_metrics(


def select_best_assertions(
metrics: Dict[str, Any],
assertion_results: Dict[str, Dict[str, List[Tuple[Dict[str, Any], int]]]],
num_assertions_per_criterion: int = None,
) -> Dict[str, Dict[str, str]]:

best_assertions = {}
criterion_assertion_results: Dict[str, Dict[str, Dict[str, Any]]],
num_assertions_per_criterion: Optional[int] = None,
) -> List[str]:
best_subset: List[str] = []

for criterion in assertion_results.keys():
all_assertions = list(assertion_results[criterion].keys())
for criterion in criterion_assertion_results.keys():
all_assertions = list(criterion_assertion_results[criterion].keys())

if not num_assertions_per_criterion:
# Intelligently select the subset of assertions that maximize the criterion's alignment score
@@ -192,7 +190,7 @@ def select_best_assertions(
# Create subset of assertion_results
subset_assertion_results = {
criterion: {
assertion: assertion_results[criterion][assertion]
assertion: criterion_assertion_results[criterion][assertion]
for assertion in subset
}
}
@@ -305,15 +303,15 @@ def format_alignment_metrics(metrics, title: str = "Alignment Metrics"):
criterion[:40].ljust(40),
"",
"",
f"{criterion_data['criterion_metrics']['alignment']:.2f}"
f"{criterion_data['criterion_metrics']['alignment']:.2f}",
)
# Add rows for each assertion
for assertion, assertion_data in criterion_data["per_assertion"].items():
table.add_row(
"",
assertion[:40].ljust(40),
assertion_data["type"].ljust(9),
f"{assertion_data['alignment']:.2f}"
f"{assertion_data['alignment']:.2f}",
)

# Print the table using the logger's console
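
For context on the dict that format_alignment_metrics walks, here is a self-contained sketch built only from the keys visible in this hunk ('criterion_metrics' -> 'alignment' and 'per_assertion' -> {'type', 'alignment'}); the top-level keying by criterion name and all concrete values are assumptions for illustration:

# Assumed metrics layout; a plain-print stand-in for the rich-table rows added above.
metrics = {
    "Answer is numerically correct": {
        "criterion_metrics": {"alignment": 0.83},
        "per_assertion": {
            "test_sum_is_correct": {"type": "python", "alignment": 0.91},
            "llm_judge_correctness": {"type": "llm", "alignment": 0.74},
        },
    },
}

for criterion, criterion_data in metrics.items():
    print(criterion[:40].ljust(40), f"{criterion_data['criterion_metrics']['alignment']:.2f}")
    for assertion, assertion_data in criterion_data["per_assertion"].items():
        print("    ", assertion[:40].ljust(40), assertion_data["type"].ljust(9),
              f"{assertion_data['alignment']:.2f}")
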
72 changes: 24 additions & 48 deletions evalforge/cli.py
@@ -1,80 +1,56 @@
import asyncio
from typing import Optional
import simple_parsing
from simple_parsing import Serializable
from dataclasses import dataclass, Field
from dataclasses import dataclass
import sys

import weave

from evalforge.forge import EvalForge
from evalforge.utils import logger
from evalforge.data_utils import load_data, DataPoint
from evalforge.data_utils import load_data

train_ds_formatted = [
DataPoint(
input_data={"text": "1+1="},
output_data={"text": "2"},
annotation=1,
note="Correct summation",
),
DataPoint(
input_data={"text": "1+1="},
output_data={"text": "3"},
annotation=0,
note="Incorrect summation",
),
DataPoint(
input_data={"text": "What is the square root of 16?"},
output_data={"text": "4"},
annotation=1,
note="Correct square root",
),
]
MINI_DATASET_PATH = "data/mini_data.jsonl"

eval_ds_formatted = [
DataPoint(
input_data={"text": "What is the square root of 16?"},
output_data={"text": "4"},
annotation=1,
note="Correct square root",
),
DataPoint(
input_data={"text": "What is the square root of 16?"},
output_data={"text": "3"},
annotation=0,
note="Incorrect square root",
),
]

@dataclass
class Args(Serializable):
data: str = "mini"# "Path to training data"
batch_size: int = 1 # "Batch size"
num_criteria_to_generate: int = 1 # "Number of criteria to generate"
llm_model: str = "gpt-4o" # "LLM model to use"
data: str = "mini" # "Path to training data"
batch_size: int = 1 # "Batch size"
num_criteria_to_generate: int = 1 # "Number of criteria to generate"
llm_model: str = "gpt-4o" # "LLM model to use"
weave_project: Optional[str] = None # "Weave project to use"


def forge():
logger.rule("EvalForge CLI")
try:
args = simple_parsing.parse(Args)

# Load the data
# Log into Weave
if args.weave_project:
weave.init(args.weave_project)

# Load the data
if args.data == "mini":
logger.info(f"Running dummy data")
train_data = train_ds_formatted
logger.info("Running dummy data")
train_data = load_data(MINI_DATASET_PATH)
else:
logger.info(f"Loading data from {args.data}")
train_data = load_data(args.data)

forger = EvalForge(
batch_size=args.batch_size,
num_criteria_to_generate=args.num_criteria_to_generate,
llm_model=args.llm_model
batch_size=args.batch_size,
num_criteria_to_generate=args.num_criteria_to_generate,
llm_model=args.llm_model,
)
# Run the fit method
asyncio.run(forger.fit(train_data))
except Exception as e:
print(f"An error occurred: {e}")
sys.exit(1)


if __name__ == "__main__":
forge()
forge()
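
The inline DataPoint lists are replaced by data/mini_data.jsonl. Below is a sketch of how such a file could be produced from the removed examples; whether load_data expects exactly this one-JSON-object-per-line layout with these four fields is an assumption:

import json

records = [  # field names taken from the DataPoint examples removed above
    {"input_data": {"text": "1+1="}, "output_data": {"text": "2"}, "annotation": 1, "note": "Correct summation"},
    {"input_data": {"text": "1+1="}, "output_data": {"text": "3"}, "annotation": 0, "note": "Incorrect summation"},
    {"input_data": {"text": "What is the square root of 16?"}, "output_data": {"text": "4"}, "annotation": 1, "note": "Correct square root"},
]

with open("mini_data.jsonl", "w") as f:  # hypothetical local path, not the repo's data/ file
    for rec in records:
        f.write(json.dumps(rec) + "\n")
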
32 changes: 19 additions & 13 deletions evalforge/code_evaluator.py
@@ -22,17 +22,19 @@ def score(
self,
model_output: Optional[Dict[str, Any]],
input_data: Dict[str, Any],
**kwargs
**kwargs,
) -> Dict[str, Any]:
if model_output is None:
logger.error("No model output provided")
return {"code_assertion_results": {
"tests_run": 0,
"passed": 0,
"failures": 0,
"errors": 0,
"test_results": {}
}}
return {
"code_assertion_results": {
"tests_run": 0,
"passed": 0,
"failures": 0,
"errors": 0,
"test_results": {},
}
}

try:
# Use the code_formatter to write assertions to files
@@ -55,10 +57,14 @@ def run_tests(self, temp_dir: str, output: Any) -> str:
import json

# Create a test context with both the model output and input data
test_context = json.dumps({
"output": output,
"input": output.get("input_data", {}) if isinstance(output, dict) else {}
})
test_context = json.dumps(
{
"output": output,
"input": (
output.get("input_data", {}) if isinstance(output, dict) else {}
),
}
)

# Run the test suite using subprocess and capture the output
result = subprocess.run(
@@ -114,4 +120,4 @@ def parse_test_results(self, test_output: str) -> Dict[str, Any]:
"failures": failures,
"errors": errors,
"test_results": test_result_dict,
}
}
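
run_tests above serializes a JSON test context and shells out to the generated unittest suite. A standalone sketch of that pattern follows; passing the context through an environment variable (TEST_CONTEXT) and discovering tests in a temp directory are assumptions, since the actual handoff is not shown in this hunk:

import json
import os
import subprocess
import sys

test_context = json.dumps(
    {"output": {"text": "4"}, "input": {"text": "What is the square root of 16?"}}
)
env = {**os.environ, "TEST_CONTEXT": test_context}  # hypothetical variable name

result = subprocess.run(
    [sys.executable, "-m", "unittest", "discover", "-s", "generated_tests"],  # hypothetical temp dir
    capture_output=True,
    text=True,
    env=env,
)
print(result.stderr or result.stdout)  # unittest writes its report to stderr
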
15 changes: 8 additions & 7 deletions evalforge/code_formatter.py
@@ -3,7 +3,7 @@
import os
import textwrap
from datetime import datetime
from typing import Dict, Optional, Set
from typing import Optional, Set

import autopep8
import isort
@@ -12,6 +12,7 @@

from evalforge.instructor_models import PythonAssertion


class CodeFormatter(weave.Object):
@weave.op
def lint_code(self, code: str) -> str:
@@ -27,8 +28,8 @@ def lint_code(self, code: str) -> str:
code = autopep8.fix_code(code, options={"aggressive": 2})
return code

def get_required_imports(self, tree: ast.AST) -> Set[tuple]:
required_imports = set()
def get_required_imports(self, tree: ast.AST) -> Set[tuple[Optional[str], str]]:
required_imports: Set[tuple[Optional[str], str]] = set()
for node in ast.walk(tree):
if isinstance(node, ast.Name):
if not self.is_builtin(node.id):
@@ -61,11 +62,11 @@ def write_assertions_to_files(
self, assertions: list[PythonAssertion], base_dir: Optional[str] = None
) -> str:
"""Write assertions to test files in the specified directory.
Args:
assertions: List of PythonAssertion objects
base_dir: Optional directory to write files to. If None, creates a timestamped directory
Returns:
str: Path to the base directory containing the generated files
"""
@@ -105,7 +106,7 @@ def create_test_file_content(self, assertion_name: str, assertion_code: str) ->
dedented_assertion_code = textwrap.dedent(assertion_code).strip()
# Re-indent the assertion code to match the class indentation (4 spaces)
indented_assertion_code = textwrap.indent(dedented_assertion_code, " ")

return f"""{imports}
class Test_{assertion_name}(OutputTestCase):
@@ -161,4 +162,4 @@ def load_tests(loader, standard_tests, pattern):
# Exit with non-zero status if there were failures
if not unittest.TextTestRunner().run(load_tests(None, None, None)).wasSuccessful():
sys.exit(1)
"""
"""
49 changes: 25 additions & 24 deletions evalforge/combined_scorer.py
@@ -1,13 +1,11 @@
import asyncio
from pathlib import Path
from typing import Any, Dict, Optional
from typing import Any, Dict, Optional, Union

import weave
from pydantic import Field

from evalforge.code_evaluator import CodeAssertionScorer, CodeFormatter
from evalforge.instructor_models import (Criterion, LLMAssertion,
PythonAssertion)
from evalforge.instructor_models import Criterion, LLMAssertion, PythonAssertion
from evalforge.llm import DEFAULT_LLM_MODEL
from evalforge.llm_evaluator import LLMAssertionScorer
from evalforge.criterion_assertion_map import CriterionAssertionMap
@@ -21,12 +19,6 @@ def predict_passthrough(
return model_output


from typing import Any, Dict

import weave



class AssertionScorer(weave.Scorer):
criterion_assertion_map: CriterionAssertionMap = Field(
default_factory=CriterionAssertionMap
@@ -73,11 +65,13 @@ async def score(
system_prompt=self.system_prompt,
)
llm_results = await llm_scorer.score(
model_output=model_output,
model_output=model_output,
input_data=input_data,
task_description=self.task_description,
)
results["llm_assertion_results"] = llm_results.get("llm_assertion_results", {})
results["llm_assertion_results"] = llm_results.get(
"llm_assertion_results", {}
)

# Process Python assertions
if python_assertions:
@@ -86,34 +80,36 @@
code_formatter=self.code_formatter,
)
code_results = code_scorer.score(
model_output=model_output,
input_data=input_data
model_output=model_output, input_data=input_data
)
results["code_assertion_results"] = code_results.get("code_assertion_results", {}).get("test_results", {})
results["code_assertion_results"] = code_results.get(
"code_assertion_results", {}
).get("test_results", {})

# Map results back to criteria using the mapping class
criterion_results: Dict[str, Dict[str, Any]] = {}
for test_name, result in results.get("llm_assertion_results", {}).items():
criterion = self.criterion_assertion_map.get_criterion_by_assertion(test_name)
criterion = self.criterion_assertion_map.get_criterion_by_assertion(
test_name
)
if criterion not in criterion_results:
criterion_results[criterion] = {}
criterion_results[criterion][test_name] = {
"score": result,
"type": "llm"
}
criterion_results[criterion][test_name] = {"score": result, "type": "llm"}

for test_name, result in results.get("code_assertion_results", {}).items():
criterion = self.criterion_assertion_map.get_criterion_by_assertion(test_name)
criterion = self.criterion_assertion_map.get_criterion_by_assertion(
test_name
)
if criterion not in criterion_results:
criterion_results[criterion] = {}
criterion_results[criterion][test_name] = {
"score": result["score"],
"type": "python"
"type": "python",
}

return criterion_results

def export(self, base_dir: str = "forged_judge"):
def export(self, base_dir: Union[str, Path] = "forged_judge") -> None:
base_dir = Path(base_dir)
llm_dir = base_dir / "llm_assertions"
python_dir = base_dir / "python_assertions"
@@ -172,7 +168,12 @@ def import_assertions(self, base_dir: str = "forged_judge"):
criterion = item["criterion"]
assertion = item["assertion"]
self.criterion_assertion_map.add_assertion(
Criterion(criterion=criterion), assertion
Criterion(
criterion=criterion,
explanation="Imported criterion",
evaluation_method="mixed",
),
assertion,
)

def load_assertions_by_criteria(self, base_dir: Path, assertion_cls):
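
For reference, the nested structure that AssertionScorer.score returns after the remapping above is criterion name -> assertion name -> {"score", "type"}. A sketch with made-up names and values (the shape of a Python assertion's score value is not shown here and is assumed to be a plain pass/fail flag):

# Illustrative result shape only; names and scores are invented.
criterion_results = {
    "Answer is numerically correct": {
        "llm_judge_correctness": {"score": 1, "type": "llm"},
        "test_sum_is_correct": {"score": 1, "type": "python"},
    },
}

for criterion, assertions in criterion_results.items():
    for name, info in assertions.items():
        print(f"{criterion} / {name} [{info['type']}]: {info['score']}")
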