more-minor-housecleaning (#254)

* More minor refactoring to clean up the code a bit.
* Remove BBQ-specific code.
mlcommons · Apr 2, 2024 · 2e8e187
1 parent 4ba7b10
commit 2e8e187
Show file tree

Hide file tree

Showing 4 changed files with 5 additions and 96 deletions.
diff --git a/src/coffee/benchmarks.py b/src/coffee/benchmarks.py
@@ -1,17 +1,16 @@
 from abc import ABC
-from datetime import datetime, timedelta
-from typing import List, Mapping, Union
+from datetime import datetime
+from typing import List
 
 import casefy
-from newhelm.records import TestRecord
 
 from coffee.hazards import (
     HazardDefinition,
     HazardScore,
     SafeHazard,
 )
 from coffee.newhelm_runner import NewhelmSut
-from coffee.scoring import LetterGradeMixin, ValueEstimate
+from coffee.scoring import LetterGradeMixin
 
 
 class BenchmarkScore(ABC, LetterGradeMixin):

diff --git a/src/coffee/dev.py b/src/coffee/dev.py
diff --git a/src/coffee/hazards.py b/src/coffee/hazards.py
@@ -1,7 +1,6 @@
 import json
 import pathlib
 from abc import ABC, abstractmethod
-from datetime import datetime
 from typing import List, Mapping
 
 import casefy
@@ -67,7 +66,7 @@ def tests(self, secrets: RawSecrets) -> List[BaseTest]:
         return [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]
 
     # "cae": "Child sexual exploitation",
-    # "cbr": "Chemical, biological, radioloigcal, nuclear and high-yield explosives weapons",
+    # "cbr": "Chemical, biological, radiological, nuclear and high-yield explosives weapons",
     # "hat": "Hate",
     # "nvc": "Non-violent crimes",
     # "sc": "Sex-related crimes",
@@ -129,7 +128,7 @@ def grade_bands(self) -> List[float]:
             1.0,
         ]
 
-        return [h - l for l, h in zip(grade_points, grade_points[1:])]
+        return [high - low for low, high in zip(grade_points, grade_points[1:])]
 
     def numeric_grade(self) -> int:
         # Based on https://docs.google.com/presentation/d/1z2lfOsd967Usa6I6gjSnnl-vJa1g4i13NmxXttPr6RU/edit#slide=id.g2c3211ae59d_1_5

diff --git a/src/coffee/run.py b/src/coffee/run.py
@@ -9,7 +9,6 @@
 from typing import List, Mapping
 
 import click
-import newhelm
 import termcolor
 from click import echo
 from newhelm.config import load_secrets_from_config, write_default_config
@@ -19,7 +18,6 @@
 
 from coffee.benchmarks import (
     BenchmarkDefinition,
-    BenchmarkScore,
     GeneralPurposeAiChatBenchmark,
 )
 from coffee.hazards import HazardDefinition, HazardScore, STANDARDS
@@ -72,7 +70,6 @@ def score_benchmarks(benchmarks, suts, max_instances, debug):
     for sut in suts:
         echo(termcolor.colored(f'Examining system "{sut.display_name}"', "green"))
         sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
-        benchmark_start_time = datetime.now(timezone.utc)
         for benchmark_definition in benchmarks:
             echo(termcolor.colored(f'  Starting run for benchmark "{benchmark_definition.name()}"', "green"))
             hazard_scores = []
@@ -82,12 +79,6 @@ def score_benchmarks(benchmarks, suts, max_instances, debug):
                 results = {}
                 for test in hazard.tests(secrets=secrets):
                     items = max_instances
-                    if isinstance(test, newhelm.tests.bbq.BBQ):
-                        # BBQ is currently multiple sub-tests, so roughly split the items among them
-                        items = int(items / len(newhelm.tests.bbq._CATEGORIES))
-                        if items <= 0:
-                            # Ensure we always do at least one.
-                            items = 1
                     results[test.uid] = run_prompt_response_test(
                         test=test, sut=sut_instance, data_dir="./run", max_test_items=items
                     )