
Commit

General cleanup (#247)
* Some general cleanup, fixing mypy errors, removal of dead code, etc.

* More minor cleanup, including removing start times.

* Formatting.

---------

Co-authored-by: william <william@williampietri.com>
dhosterman and wpietri authored Mar 30, 2024
1 parent e8ea903 commit c7bf9a5
Showing 11 changed files with 40 additions and 74 deletions.
25 changes: 4 additions & 21 deletions src/coffee/benchmarks.py
@@ -1,6 +1,6 @@
from abc import ABC
from datetime import datetime, timedelta
from typing import List, Mapping
from typing import List, Mapping, Union

import casefy
from newhelm.records import TestRecord
@@ -20,22 +20,17 @@ def __init__(
benchmark_definition: "BenchmarkDefinition",
sut: NewhelmSut,
hazard_scores: List["HazardScore"],
start_time: datetime,
end_time: datetime,
):
super().__init__()
self.benchmark_definition = benchmark_definition
self.sut = sut
self.hazard_scores = hazard_scores
self.start_time = start_time
self.end_time = end_time

def numeric_grade(self):
return min([hazard.numeric_grade() for hazard in self.hazard_scores])

def total_time(self) -> timedelta:
return self.end_time - self.start_time

def __repr__(self):
return (
self.__class__.__name__
@@ -52,7 +47,7 @@ def __repr__(self):
class BenchmarkDefinition(ABC):
_hazards: list[HazardDefinition]

def __init__(self, hazards: List[HazardDefinition], secrets: dict = None):
def __init__(self, hazards: List[HazardDefinition]):
super().__init__()
self._hazards = hazards

@@ -67,20 +62,8 @@ def name(cls):
def path_name(cls):
return casefy.snakecase(cls.__name__.replace("Definition", ""))

def score(self, sut_scores: Mapping[str, TestRecord], start_time: datetime, end_time: datetime) -> "HazardScore":
from newhelm.tests.real_toxicity_prompts import RealToxicityPromptsResult

for test_record in sut_scores.values():
result = test_record.result.to_instance(RealToxicityPromptsResult)
toxicity = result.empirical_probability_toxicity
test_scores = {
"real_toxicity_prompts": ValueEstimate.make(1 - toxicity, len(test_record.test_item_records))
}
return HazardScore(
hazard_definition=self,
score=ValueEstimate.combine(test_scores.values()),
test_scores=test_scores,
)
def score(self, sut, hazard_scores, benchmark_end_time):
return BenchmarkScore(self, sut, hazard_scores, benchmark_end_time)


class GeneralPurposeAiChatBenchmark(BenchmarkDefinition):
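
As context for the new code path: after this change a BenchmarkScore carries only an end_time, and its numeric grade is simply the worst grade among its hazard scores, which BenchmarkDefinition.score now builds directly. The following is a minimal, self-contained sketch of that behaviour; the Fake* classes and literal values are illustrative stand-ins, not code from this repository.

from datetime import datetime, timezone

class FakeHazardScore:
    """Stand-in for coffee.hazards.HazardScore; only numeric_grade matters here."""

    def __init__(self, grade: int):
        self._grade = grade

    def numeric_grade(self) -> int:
        return self._grade


class FakeBenchmarkScore:
    """Mirrors the trimmed-down BenchmarkScore constructor from the diff above."""

    def __init__(self, benchmark_definition, sut, hazard_scores, end_time: datetime):
        self.benchmark_definition = benchmark_definition
        self.sut = sut
        self.hazard_scores = hazard_scores
        self.end_time = end_time  # start_time and total_time() no longer exist

    def numeric_grade(self) -> int:
        # The benchmark grade is the lowest (worst) hazard grade.
        return min(h.numeric_grade() for h in self.hazard_scores)


score = FakeBenchmarkScore("benchmark", "GPT2", [FakeHazardScore(3), FakeHazardScore(5)],
                           datetime.now(timezone.utc))
assert score.numeric_grade() == 3
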
5 changes: 2 additions & 3 deletions src/coffee/hazards.py
@@ -33,13 +33,13 @@ def reference_standard(self) -> float:
return STANDARDS.reference_standard_for(self.key())

@abstractmethod
def score(self, sut_scores: Mapping[str, TestRecord], start_time: datetime, end_time: datetime) -> "HazardScore":
def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
pass


class SafeHazard(HazardDefinition, ABC):

def score(self, sut_scores: Mapping[str, TestRecord], start_time: datetime, end_time: datetime) -> "HazardScore":
def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
test_scores: dict[str, ValueEstimate] = {}
for key, test_record in sut_scores.items():
result = test_record.result.to_instance(SafeTestResult)
@@ -116,7 +116,6 @@ class Config:
score: ValueEstimate
test_scores: Mapping[str, ValueEstimate]
"""Test scores are specific to this hazard."""
# TODO Decide if we need start/end times here or just on benchmark.

def grade_bands(self) -> List[float]:
reference_standard = 1 - self.hazard_definition.reference_standard()
6 changes: 3 additions & 3 deletions src/coffee/newhelm_runner.py
@@ -1,6 +1,6 @@
import dataclasses
from enum import Enum
from typing import Type
from typing import Type, Union

from newhelm.secret_values import InjectSecret
from newhelm.sut_registry import SUTS
@@ -12,8 +12,8 @@
class SutDescription:
key: str
display_name: str
newhelm_class: Type = None
newhelm_key: str = None
newhelm_class: Union[None, Type] = None
newhelm_key: str = ""

def __hash__(self):
return super().__hash__()
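
The typing change above is the usual fix for mypy's no-implicit-Optional rule: Union[None, Type] is just another spelling of Optional[Type], and newhelm_key gets an empty-string default instead of None. A small sketch of the same pattern (the class name below is illustrative, not the repository's):

import dataclasses
from typing import Type, Union


@dataclasses.dataclass
class SutDescriptionSketch:
    key: str
    display_name: str
    newhelm_class: Union[None, Type] = None  # same as Optional[Type]; a bare `Type = None` fails strict mypy
    newhelm_key: str = ""  # empty string rather than an implicitly-Optional None


print(SutDescriptionSketch(key="gpt2", display_name="OpenAI GPT-2"))
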
14 changes: 4 additions & 10 deletions src/coffee/run.py
@@ -80,7 +80,6 @@ def score_benchmarks(benchmarks, suts, max_instances, debug):
echo(termcolor.colored(f' Examining hazard "{hazard.name()}"', "green"))

results = {}
hazard_start_time = datetime.now(timezone.utc)
for test in hazard.tests(secrets=secrets):
items = max_instances
if isinstance(test, newhelm.tests.bbq.BBQ):
@@ -93,17 +92,14 @@
test=test, sut=sut_instance, data_dir="./run", max_test_items=items
)

hazard_end_time = datetime.now(timezone.utc)
score = hazard.score(results, hazard_start_time, hazard_end_time)
score = hazard.score(results)
if debug:
echo(
termcolor.colored(f" For hazard {hazard.name()}, {sut.name} scores {score.value()}", "green")
)
hazard_scores.append(score)
benchmark_end_time = datetime.now(timezone.utc)
benchmark_scores.append(
BenchmarkScore(benchmark_definition, sut, hazard_scores, benchmark_start_time, benchmark_end_time)
)
benchmark_scores.append(benchmark_definition.score(sut, hazard_scores, benchmark_end_time))
return benchmark_scores


@@ -168,9 +164,7 @@ def calibrate(update: bool, file) -> None:

def update_standards_to(file):
reference_sut = NewhelmSut.GPT2
hazards = itertools.chain.from_iterable(
[benchmark().hazards() for benchmark in BenchmarkDefinition.__subclasses__()]
)
hazards = itertools.chain.from_iterable([bm().hazards() for bm in BenchmarkDefinition.__subclasses__()])
hazard_scores = run_tests(hazards, reference_sut, 100)
result = {
"_metadata": {
@@ -205,7 +199,7 @@ def run_tests(hazards: List[HazardDefinition], sut: NewhelmSut, items: int) -> M
test_scores[test.uid] = run_prompt_response_test(
test=test, sut=sut_instance, data_dir="./run", max_test_items=items
)
result[hazard] = hazard.score(test_scores, None, None)
result[hazard] = hazard.score(test_scores)
return result


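
Stripped of the per-hazard timing, the scoring loop in run.py reduces to roughly the shape below. This is a hedged sketch rather than the real function: run_tests_for stands in for the inner loop over hazard.tests(...) and run_prompt_response_test, and the debug/max_instances handling is omitted.

from datetime import datetime, timezone


def score_benchmarks_sketch(benchmark_definition, suts, run_tests_for):
    benchmark_scores = []
    for sut in suts:
        hazard_scores = []
        for hazard in benchmark_definition.hazards():
            results = run_tests_for(hazard, sut)  # mapping of test key -> TestRecord
            hazard_scores.append(hazard.score(results))  # no start/end times any more
        benchmark_end_time = datetime.now(timezone.utc)
        benchmark_scores.append(benchmark_definition.score(sut, hazard_scores, benchmark_end_time))
    return benchmark_scores
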
10 changes: 5 additions & 5 deletions src/coffee/scoring.py
@@ -1,5 +1,5 @@
from abc import abstractmethod
from typing import Sequence, Tuple
from typing import Iterable, Sequence, Tuple

import scipy
from pydantic import BaseModel
@@ -33,14 +33,14 @@ def make(probability: float, samples: int) -> "ValueEstimate":
return ValueEstimate._estimate_confidence_intervals([(probability, samples)])

@staticmethod
def combine(estimates: Sequence["ValueEstimate"]) -> "ValueEstimate":
estimates = [(v.estimate, v.samples) for v in estimates]
return ValueEstimate._estimate_confidence_intervals(estimates)
def combine(estimates: Iterable["ValueEstimate"]) -> "ValueEstimate":
_estimates = [(v.estimate, v.samples) for v in estimates]
return ValueEstimate._estimate_confidence_intervals(_estimates)

@staticmethod
def _estimate_confidence_intervals(estimates: Sequence[Tuple[float, int]]) -> "ValueEstimate":
assert len(estimates) > 0, "Must have at least one estimate."
successes = 0
successes = 0.0
trials = 0
for probability, samples in estimates:
assert 0 <= probability <= 1, "Expected all estimates to be probabilities."
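
Two details in scoring.py are worth spelling out: combine now accepts any Iterable of estimates and materialises it into a list before use, and successes starts at 0.0 because the accumulated value is fractional, which is presumably the mypy complaint being fixed. The sketch below follows the accumulation loop visible in the diff; the interval computation inside _estimate_confidence_intervals is not shown there, and the `successes += probability * samples` line is an assumption about the elided body.

from typing import Iterable, Tuple


def pooled_estimate(estimates: Iterable[Tuple[float, int]]) -> Tuple[float, int]:
    """Pool (probability, samples) pairs into one overall proportion and trial count."""
    pairs = list(estimates)  # accept any iterable, as combine() now does
    assert len(pairs) > 0, "Must have at least one estimate."
    successes = 0.0  # float: fractional successes come from probability * samples
    trials = 0
    for probability, samples in pairs:
        assert 0 <= probability <= 1, "Expected all estimates to be probabilities."
        successes += probability * samples  # assumed pooling; not shown in the diff
        trials += samples
    return successes / trials, trials


print(pooled_estimate([(0.9, 100), (0.8, 50)]))  # (0.8666..., 150)
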
12 changes: 7 additions & 5 deletions src/coffee/static_site_generator.py
@@ -1,4 +1,3 @@
import math
import pathlib
import shutil
from collections import defaultdict
@@ -12,7 +11,7 @@

from coffee.benchmarks import BenchmarkDefinition, BenchmarkScore
from coffee.hazards import HazardDefinition
from coffee.newhelm_runner import NewhelmSut, SutDescription
from coffee.newhelm_runner import SutDescription
from coffee.utilities import group_by_key


@@ -37,7 +36,8 @@ def __init__(self, view_embed: bool = False) -> None:
self.env.globals["content"] = self.content
self._content = self._load_content()

def _load_content(self):
@staticmethod
def _load_content():
content = {}
for file in (pathlib.Path(__file__).parent / "templates" / "content").rglob("*.toml"):
with open(file, "rb") as f:
@@ -79,7 +79,8 @@ def content_test(self, item: BaseTest, key: str):
value = defaults[key]
return value

def _template_dir(self):
@staticmethod
def _template_dir():
current_path = pathlib.Path(__file__)
while not current_path.name == "coffee":
current_path = current_path.parent
@@ -118,7 +119,8 @@ def _generate_index_page(self, output_dir: pathlib.Path) -> None:
view_embed=self.view_embed,
)

def _grouped_benchmark_scores(self, benchmark_scores: list[BenchmarkScore]) -> Mapping:
@staticmethod
def _grouped_benchmark_scores(benchmark_scores: list[BenchmarkScore]) -> Mapping:
return group_by_key(benchmark_scores, lambda x: x.benchmark_definition)

def _generate_benchmarks_page(self, benchmark_scores: list[BenchmarkScore], output_dir: pathlib.Path) -> None:
2 changes: 1 addition & 1 deletion src/coffee/templates/test_report.html
@@ -69,7 +69,7 @@ <h6 class="mlc--test-detail-header">Benchmark Version {{ tooltip_info("Benchmark
</div>
<div>
<h6 class="mlc--test-detail-header">Last Run {{ tooltip_info("Last run info") }}</h6>
<p>{{ benchmark_score.start_time.strftime('%Y-%m-%d %H:%M:%S %Z') }}</p>
<p>{{ benchmark_score.end_time.strftime('%Y-%m-%d %H:%M:%S %Z') }}</p>
</div>
<div>
<h6 class="mlc--test-detail-header">Model Display Name {{ tooltip_info("Model display info") }}</h6>
11 changes: 5 additions & 6 deletions tests/templates/conftest.py
@@ -17,7 +17,7 @@
from coffee.utilities import group_by_key


def _benchmark_score(start_time, end_time) -> BenchmarkScore:
def _benchmark_score(end_time) -> BenchmarkScore:
bd = GeneralPurposeAiChatBenchmark()
cae_hazard = SafeCaeHazard()
cbr_hazard = SafeCbrHazard()
@@ -27,20 +27,19 @@ def _benchmark_score(start_time, end_time) -> BenchmarkScore:
bd,
NewhelmSut.GPT2,
[cae_score, cbr_score],
start_time,
end_time,
)
return bs


@pytest.fixture()
def benchmark_score(start_time, end_time) -> BenchmarkScore:
return _benchmark_score(start_time, end_time)
def benchmark_score(end_time) -> BenchmarkScore:
return _benchmark_score(end_time)


@pytest.fixture()
def grouped_benchmark_scores(start_time, end_time) -> dict[str, list[BenchmarkScore]]:
scores = [_benchmark_score(start_time, end_time)]
def grouped_benchmark_scores(end_time) -> dict[str, list[BenchmarkScore]]:
scores = [_benchmark_score(end_time)]
return group_by_key(scores, key=lambda x: x.benchmark_definition)


18 changes: 4 additions & 14 deletions tests/test_benchmark.py
@@ -11,7 +11,6 @@
BenchmarkScore,
GeneralPurposeAiChatBenchmark,
)
from coffee.scoring import ValueEstimate
from coffee.hazards import (
HazardDefinition,
HazardScore,
@@ -27,6 +26,7 @@
SafeVioHazard,
)
from coffee.newhelm_runner import NewhelmSut
from coffee.scoring import ValueEstimate

SIMPLE_CAE_DATA = pathlib.Path(__file__).parent / "data/newhelm_runs/cae"
SIMPLE_CBR_DATA = pathlib.Path(__file__).parent / "data/newhelm_runs/cbr"
@@ -61,8 +61,7 @@ def func(probability):
GeneralPurposeAiChatBenchmark(),
NewhelmSut.GPT2,
[HazardScore(hazard_definition=bd, score=ve, test_scores={})],
None,
None,
datetime.fromtimestamp(1700000000),
)
return bs

@@ -116,21 +115,14 @@ def test_hazard_definition_basics(fake_secrets):
assert t.__class__ == newhelm.tests.safe.SafeTest


def test_hazard_score_basics(start_time, end_time):
def test_hazard_score_basics():
bd = SafeCaeHazard()
a_score = bd.reference_standard()
hs = HazardScore(hazard_definition=bd, score=ValueEstimate.make(a_score, 50), test_scores={})
assert hs.hazard_definition == bd
assert hs.score.estimate == a_score


def test_hazard_score_with_max_standard(start_time, end_time):
def dummy(raw_score):
hd = SafeCaeHazard()
hd._three_star_standard = 1
return HazardScore(hd, raw_score, start_time, end_time)


def test_newhelm_sut_display_name_and_name():
assert NewhelmSut.GPT2.display_name == "OpenAI GPT-2"
assert NewhelmSut.GPT2.name == "GPT2"
@@ -149,9 +141,7 @@ class FakeHazardDefinition(HazardDefinition):
def tests(self, secrets: RawSecrets) -> List[BaseTest]:
return []

def score(
self, sut_scores: Mapping[str, TestRecord], start_time: datetime, end_time: datetime
) -> "HazardScore":
def score(self, sut_scores: Mapping[str, TestRecord]) -> "HazardScore":
pass

hd = FakeHazardDefinition()
2 changes: 1 addition & 1 deletion tests/test_run.py
@@ -9,7 +9,7 @@


@patch("coffee.run.run_tests")
def test_update_standards(fake_run, tmp_path, start_time, end_time, fake_secrets):
def test_update_standards(fake_run, tmp_path, fake_secrets):
with unittest.mock.patch("coffee.run.load_secrets_from_config", return_value=fake_secrets):
bias_hazard = SafeCbrHazard()
fake_run.return_value = {
9 changes: 4 additions & 5 deletions tests/test_static_site_generator.py
@@ -1,4 +1,5 @@
import abc
import datetime
import pathlib
from unittest.mock import MagicMock
from unittest.mock import patch
@@ -16,12 +17,12 @@
BenchmarkScore,
)
from coffee.scoring import ValueEstimate
from coffee.hazards import HazardDefinition, HazardScore, SafeCaeHazard, SafeCbrHazard, SafeHazard
from coffee.hazards import HazardScore, SafeCaeHazard, SafeCbrHazard, SafeHazard
from coffee.static_site_generator import StaticSiteGenerator, min_bar_size


@pytest.fixture()
def benchmark_score(start_time, end_time):
def benchmark_score(end_time):
bd = GeneralPurposeAiChatBenchmark()
bs = BenchmarkScore(
bd,
@@ -38,7 +39,6 @@ def benchmark_score(start_time, end_time):
test_scores={},
),
],
start_time=start_time,
end_time=end_time,
)
return bs
@@ -133,8 +133,7 @@ def benchmark_score(self):
hazard_definition=bh, score=ValueEstimate.make(bh.reference_standard(), 50), test_scores={}
),
],
None,
None,
datetime.datetime.fromtimestamp(170000000),
)
return bs

Expand Down
