
Commit

General cleanup (#247)
* Some general cleanup, fixing mypy errors, removal of dead code, etc.

* More minor cleanup, including removing start times.

* Formatting.

---------

Co-authored-by: william <william@williampietri.com>
dhosterman and wpietri authored Mar 30, 2024
1 parent e8ea903 commit c7bf9a5
Showing 11 changed files with 40 additions and 74 deletions.
25 changes: 4 additions & 21 deletions src/coffee/benchmarks.py
@@ -1,6 +1,6 @@
from abc import ABC
from datetime import datetime, timedelta
from typing import List, Mapping
from typing import List, Mapping, Union

import casefy
from newhelm.records import TestRecord
@@ -20,22 +20,17 @@ def __init__(
benchmark_definition: "BenchmarkDefinition",
sut: NewhelmSut,
hazard_scores: List["HazardScore"],
start_time: datetime,
end_time: datetime,
):
super().__init__()
self.benchmark_definition = benchmark_definition
self.sut = sut
self.hazard_scores = hazard_scores
self.start_time = start_time
self.end_time = end_time

def numeric_grade(self):
return min([hazard.numeric_grade() for hazard in self.hazard_scores])

def total_time(self) -> timedelta:
return self.end_time - self.start_time

def __repr__(self):
return (
self.__class__.__name__
@@ -52,7 +47,7 @@ def __repr__(self):
class BenchmarkDefinition(ABC):
_hazards: list[HazardDefinition]

def __init__(self, hazards: List[HazardDefinition], secrets: dict = None):
def __init__(self, hazards: List[HazardDefinition]):
super().__init__()
self._hazards = hazards

@@ -67,20 +62,8 @@ def name(cls):
def path_name(cls):
return casefy.snakecase(cls.__name__.replace("Definition", ""))

def score(self, sut_scores: Mapping[str, TestRecord], start_time: datetime, end_time: datetime) -> "HazardScore":
from newhelm.tests.real_toxicity_prompts import RealToxicityPromptsResult

for test_record in sut_scores.values():
result = test_record.result.to_instance(RealToxicityPromptsResult)
toxicity = result.empirical_probability_toxicity
test_scores = {
"real_toxicity_prompts": ValueEstimate.make(1 - toxicity, len(test_record.test_item_records))
}
return HazardScore(
hazard_definition=self,
score=ValueEstimate.combine(test_scores.values()),
test_scores=test_scores,
)
def score(self, sut, hazard_scores, benchmark_end_time):
return BenchmarkScore(self, sut, hazard_scores, benchmark_end_time)


class GeneralPurposeAiChatBenchmark(BenchmarkDefinition):
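
As context for the new code path: after this change a BenchmarkScore carries only an end_time, and its numeric grade is simply the worst grade among its hazard scores, which BenchmarkDefinition.score now builds directly. The following is a minimal, self-contained sketch of that behaviour; the Fake* classes and literal values are illustrative stand-ins, not code from this repository.

from datetime import datetime, timezone

class FakeHazardScore:
    """Stand-in for coffee.hazards.HazardScore; only numeric_grade matters here."""

    def __init__(self, grade: int):
        self._grade = grade

    def numeric_grade(self) -> int:
        return self._grade


class FakeBenchmarkScore:
    """Mirrors the trimmed-down BenchmarkScore constructor from the diff above."""

    def __init__(self, benchmark_definition, sut, hazard_scores, end_time: datetime):
        self.benchmark_definition = benchmark_definition
        self.sut = sut
        self.hazard_scores = hazard_scores
        self.end_time = end_time  # start_time and total_time() no longer exist

    def numeric_grade(self) -> int:
        # The benchmark grade is the lowest (worst) hazard grade.
        return min(h.numeric_grade() for h in self.hazard_scores)


score = FakeBenchmarkScore("benchmark", "GPT2", [FakeHazardScore(3), FakeHazardScore(5)],
                           datetime.now(timezone.utc))
assert score.numeric_grade() == 3
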
5 changes: 2 additions & 3 deletions src/coffee/hazards.py
@@ -33,13 +33,13 @@ def reference_standard(self) -> float:
return STANDARDS.reference_standard_for(self.key())

@abstractmethod
def score(self, sut_scores: Mapping[str, TestRecord], start_time: datetime, end_time: datetime) -> "HazardScore":
def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
pass


class SafeHazard(HazardDefinition, ABC):

def score(self, sut_scores: Mapping[str, TestRecord], start_time: datetime, end_time: datetime) -> "HazardScore":
def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
test_scores: dict[str, ValueEstimate] = {}
for key, test_record in sut_scores.items():
result = test_record.result.to_instance(SafeTestResult)
@@ -116,7 +116,6 @@ class Config:
score: ValueEstimate
test_scores: Mapping[str, ValueEstimate]
"""Test scores are specific to this hazard."""
# TODO Decide if we need start/end times here or just on benchmark.

def grade_bands(self) -> List[float]:
reference_standard = 1 - self.hazard_definition.reference_standard()
6 changes: 3 additions & 3 deletions src/coffee/newhelm_runner.py
@@ -1,6 +1,6 @@
import dataclasses
from enum import Enum
from typing import Type
from typing import Type, Union

from newhelm.secret_values import InjectSecret
from newhelm.sut_registry import SUTS
@@ -12,8 +12,8 @@
class SutDescription:
key: str
display_name: str
newhelm_class: Type = None
newhelm_key: str = None
newhelm_class: Union[None, Type] = None
newhelm_key: str = ""

def __hash__(self):
return super().__hash__()
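
The typing change above is the usual fix for mypy's no-implicit-Optional rule: Union[None, Type] is just another spelling of Optional[Type], and newhelm_key gets an empty-string default instead of None. A small sketch of the same pattern (the class name below is illustrative, not the repository's):

import dataclasses
from typing import Type, Union


@dataclasses.dataclass
class SutDescriptionSketch:
    key: str
    display_name: str
    newhelm_class: Union[None, Type] = None  # same as Optional[Type]; a bare `Type = None` fails strict mypy
    newhelm_key: str = ""  # empty string rather than an implicitly-Optional None


print(SutDescriptionSketch(key="gpt2", display_name="OpenAI GPT-2"))
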
14 changes: 4 additions & 10 deletions src/coffee/run.py
@@ -80,7 +80,6 @@ def score_benchmarks(benchmarks, suts, max_instances, debug):
echo(termcolor.colored(f' Examining hazard "{hazard.name()}"', "green"))

results = {}
hazard_start_time = datetime.now(timezone.utc)
for test in hazard.tests(secrets=secrets):
items = max_instances
if isinstance(test, newhelm.tests.bbq.BBQ):
@@ -93,17 +92,14 @@
test=test, sut=sut_instance, data_dir="./run", max_test_items=items
)

hazard_end_time = datetime.now(timezone.utc)
score = hazard.score(results, hazard_start_time, hazard_end_time)
score = hazard.score(results)
if debug:
echo(
termcolor.colored(f" For hazard {hazard.name()}, {sut.name} scores {score.value()}", "green")
)
hazard_scores.append(score)
benchmark_end_time = datetime.now(timezone.utc)
benchmark_scores.append(
BenchmarkScore(benchmark_definition, sut, hazard_scores, benchmark_start_time, benchmark_end_time)
)
benchmark_scores.append(benchmark_definition.score(sut, hazard_scores, benchmark_end_time))
return benchmark_scores


@@ -168,9 +164,7 @@ def calibrate(update: bool, file) -> None:

def update_standards_to(file):
reference_sut = NewhelmSut.GPT2
hazards = itertools.chain.from_iterable(
[benchmark().hazards() for benchmark in BenchmarkDefinition.__subclasses__()]
)
hazards = itertools.chain.from_iterable([bm().hazards() for bm in BenchmarkDefinition.__subclasses__()])
hazard_scores = run_tests(hazards, reference_sut, 100)
result = {
"_metadata": {
@@ -205,7 +199,7 @@ def run_tests(hazards: List[HazardDefinition], sut: NewhelmSut, items: int) -> M
test_scores[test.uid] = run_prompt_response_test(
test=test, sut=sut_instance, data_dir="./run", max_test_items=items
)
result[hazard] = hazard.score(test_scores, None, None)
result[hazard] = hazard.score(test_scores)
return result


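
Stripped of the per-hazard timing, the scoring loop in run.py reduces to roughly the shape below. This is a hedged sketch rather than the real function: run_tests_for stands in for the inner loop over hazard.tests(...) and run_prompt_response_test, and the debug/max_instances handling is omitted.

from datetime import datetime, timezone


def score_benchmarks_sketch(benchmark_definition, suts, run_tests_for):
    benchmark_scores = []
    for sut in suts:
        hazard_scores = []
        for hazard in benchmark_definition.hazards():
            results = run_tests_for(hazard, sut)  # mapping of test key -> TestRecord
            hazard_scores.append(hazard.score(results))  # no start/end times any more
        benchmark_end_time = datetime.now(timezone.utc)
        benchmark_scores.append(benchmark_definition.score(sut, hazard_scores, benchmark_end_time))
    return benchmark_scores
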
10 changes: 5 additions & 5 deletions src/coffee/scoring.py
@@ -1,5 +1,5 @@
from abc import abstractmethod
from typing import Sequence, Tuple
from typing import Iterable, Sequence, Tuple

import scipy
from pydantic import BaseModel
@@ -33,14 +33,14 @@ def make(probability: float, samples: int) -> "ValueEstimate":
return ValueEstimate._estimate_confidence_intervals([(probability, samples)])

@staticmethod
def combine(estimates: Sequence["ValueEstimate"]) -> "ValueEstimate":
estimates = [(v.estimate, v.samples) for v in estimates]
return ValueEstimate._estimate_confidence_intervals(estimates)
def combine(estimates: Iterable["ValueEstimate"]) -> "ValueEstimate":
_estimates = [(v.estimate, v.samples) for v in estimates]
return ValueEstimate._estimate_confidence_intervals(_estimates)

@staticmethod
def _estimate_confidence_intervals(estimates: Sequence[Tuple[float, int]]) -> "ValueEstimate":
assert len(estimates) > 0, "Must have at least one estimate."
successes = 0
successes = 0.0
trials = 0
for probability, samples in estimates:
assert 0 <= probability <= 1, "Expected all estimates to be probabilities."
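
Two details in scoring.py are worth spelling out: combine now accepts any Iterable of estimates and materialises it into a list before use, and successes starts at 0.0 because the accumulated value is fractional, which is presumably the mypy complaint being fixed. The sketch below follows the accumulation loop visible in the diff; the interval computation inside _estimate_confidence_intervals is not shown there, and the `successes += probability * samples` line is an assumption about the elided body.

from typing import Iterable, Tuple


def pooled_estimate(estimates: Iterable[Tuple[float, int]]) -> Tuple[float, int]:
    """Pool (probability, samples) pairs into one overall proportion and trial count."""
    pairs = list(estimates)  # accept any iterable, as combine() now does
    assert len(pairs) > 0, "Must have at least one estimate."
    successes = 0.0  # float: fractional successes come from probability * samples
    trials = 0
    for probability, samples in pairs:
        assert 0 <= probability <= 1, "Expected all estimates to be probabilities."
        successes += probability * samples  # assumed pooling; not shown in the diff
        trials += samples
    return successes / trials, trials


print(pooled_estimate([(0.9, 100), (0.8, 50)]))  # (0.8666..., 150)
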
12 changes: 7 additions & 5 deletions src/coffee/static_site_generator.py
@@ -1,4 +1,3 @@
import math
import pathlib
import shutil
from collections import defaultdict
@@ -12,7 +11,7 @@

from coffee.benchmarks import BenchmarkDefinition, BenchmarkScore
from coffee.hazards import HazardDefinition
from coffee.newhelm_runner import NewhelmSut, SutDescription
from coffee.newhelm_runner import SutDescription
from coffee.utilities import group_by_key


@@ -37,7 +36,8 @@ def __init__(self, view_embed: bool = False) -> None:
self.env.globals["content"] = self.content
self._content = self._load_content()

def _load_content(self):
@staticmethod
def _load_content():
content = {}
for file in (pathlib.Path(__file__).parent / "templates" / "content").rglob("*.toml"):
with open(file, "rb") as f:
@@ -79,7 +79,8 @@ def content_test(self, item: BaseTest, key: str):
value = defaults[key]
return value

def _template_dir(self):
@staticmethod
def _template_dir():
current_path = pathlib.Path(__file__)
while not current_path.name == "coffee":
current_path = current_path.parent
@@ -118,7 +119,8 @@ def _generate_index_page(self, output_dir: pathlib.Path) -> None:
view_embed=self.view_embed,
)

def _grouped_benchmark_scores(self, benchmark_scores: list[BenchmarkScore]) -> Mapping:
@staticmethod
def _grouped_benchmark_scores(benchmark_scores: list[BenchmarkScore]) -> Mapping:
return group_by_key(benchmark_scores, lambda x: x.benchmark_definition)

def _generate_benchmarks_page(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
2 changes: 1 addition & 1 deletion src/coffee/templates/test_report.html
@@ -69,7 +69,7 @@ <h6 class="mlc--test-detail-header">Benchmark Version {{ tooltip_info("Benchmark
</div>
<div>
<h6 class="mlc--test-detail-header">Last Run {{ tooltip_info("Last run info") }}</h6>
<p>{{ benchmark_score.start_time.strftime('%Y-%m-%d %H:%M:%S %Z') }}</p>
<p>{{ benchmark_score.end_time.strftime('%Y-%m-%d %H:%M:%S %Z') }}</p>
</div>
<div>
<h6 class="mlc--test-detail-header">Model Display Name {{ tooltip_info("Model display info") }}</h6>
11 changes: 5 additions & 6 deletions tests/templates/conftest.py
@@ -17,7 +17,7 @@
from coffee.utilities import group_by_key


def _benchmark_score(start_time, end_time) -> BenchmarkScore:
def _benchmark_score(end_time) -> BenchmarkScore:
bd = GeneralPurposeAiChatBenchmark()
cae_hazard = SafeCaeHazard()
cbr_hazard = SafeCbrHazard()
@@ -27,20 +27,19 @@ def _benchmark_score(start_time, end_time) -> BenchmarkScore:
bd,
NewhelmSut.GPT2,
[cae_score, cbr_score],
start_time,
end_time,
)
return bs


@pytest.fixture()
def benchmark_score(start_time, end_time) -> BenchmarkScore:
return _benchmark_score(start_time, end_time)
def benchmark_score(end_time) -> BenchmarkScore:
return _benchmark_score(end_time)


@pytest.fixture()
def grouped_benchmark_scores(start_time, end_time) -> dict[str, list[BenchmarkScore]]:
scores = [_benchmark_score(start_time, end_time)]
def grouped_benchmark_scores(end_time) -> dict[str, list[BenchmarkScore]]:
scores = [_benchmark_score(end_time)]
return group_by_key(scores, key=lambda x: x.benchmark_definition)


18 changes: 4 additions & 14 deletions tests/test_benchmark.py
@@ -11,7 +11,6 @@
BenchmarkScore,
GeneralPurposeAiChatBenchmark,
)
from coffee.scoring import ValueEstimate
from coffee.hazards import (
HazardDefinition,
HazardScore,
@@ -27,6 +26,7 @@
SafeVioHazard,
)
from coffee.newhelm_runner import NewhelmSut
from coffee.scoring import ValueEstimate

SIMPLE_CAE_DATA = pathlib.Path(__file__).parent / "data/newhelm_runs/cae"
SIMPLE_CBR_DATA = pathlib.Path(__file__).parent / "data/newhelm_runs/cbr"
@@ -61,8 +61,7 @@ def func(probability):
GeneralPurposeAiChatBenchmark(),
NewhelmSut.GPT2,
[HazardScore(hazard_definition=bd, score=ve, test_scores={})],
None,
None,
datetime.fromtimestamp(1700000000),
)
return bs

@@ -116,21 +115,14 @@ def test_hazard_definition_basics(fake_secrets):
assert t.__class__ == newhelm.tests.safe.SafeTest


def test_hazard_score_basics(start_time, end_time):
def test_hazard_score_basics():
bd = SafeCaeHazard()
a_score = bd.reference_standard()
hs = HazardScore(hazard_definition=bd, score=ValueEstimate.make(a_score, 50), test_scores={})
assert hs.hazard_definition == bd
assert hs.score.estimate == a_score


def test_hazard_score_with_max_standard(start_time, end_time):
def dummy(raw_score):
hd = SafeCaeHazard()
hd._three_star_standard = 1
return HazardScore(hd, raw_score, start_time, end_time)


def test_newhelm_sut_display_name_and_name():
assert NewhelmSut.GPT2.display_name == "OpenAI GPT-2"
assert NewhelmSut.GPT2.name == "GPT2"
@@ -149,9 +141,7 @@ class FakeHazardDefinition(HazardDefinition):
def tests(self, secrets: RawSecrets) -> List[BaseTest]:
return []

def score(
self, sut_scores: Mapping[str, TestRecord], start_time: datetime, end_time: datetime
) -> "HazardScore":
def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
pass

hd = FakeHazardDefinition()
2 changes: 1 addition & 1 deletion tests/test_run.py
@@ -9,7 +9,7 @@


@patch("coffee.run.run_tests")
def test_update_standards(fake_run, tmp_path, start_time, end_time, fake_secrets):
def test_update_standards(fake_run, tmp_path, fake_secrets):
with unittest.mock.patch("coffee.run.load_secrets_from_config", return_value=fake_secrets):
bias_hazard = SafeCbrHazard()
fake_run.return_value = {
9 changes: 4 additions & 5 deletions tests/test_static_site_generator.py
@@ -1,4 +1,5 @@
import abc
import datetime
import pathlib
from unittest.mock import MagicMock
from unittest.mock import patch
@@ -16,12 +17,12 @@
BenchmarkScore,
)
from coffee.scoring import ValueEstimate
from coffee.hazards import HazardDefinition, HazardScore, SafeCaeHazard, SafeCbrHazard, SafeHazard
from coffee.hazards import HazardScore, SafeCaeHazard, SafeCbrHazard, SafeHazard
from coffee.static_site_generator import StaticSiteGenerator, min_bar_size


@pytest.fixture()
def benchmark_score(start_time, end_time):
def benchmark_score(end_time):
bd = GeneralPurposeAiChatBenchmark()
bs = BenchmarkScore(
bd,
@@ -38,7 +39,6 @@ def benchmark_score(start_time, end_time):
test_scores={},
),
],
start_time=start_time,
end_time=end_time,
)
return bs
@@ -133,8 +133,7 @@ def benchmark_score(self):
hazard_definition=bh, score=ValueEstimate.make(bh.reference_standard(), 50), test_scores={}
),
],
None,
None,
datetime.datetime.fromtimestamp(170000000),
)
return bs

Expand Down
