Skip to content

Commit

Permalink
more-minor-housecleaning (#254)
Browse files (browse the repository at this point in the history)
* More minor refactoring to clean up the code a bit.

* Remove BBQ-specific code.
  • Loading branch information
dhosterman authored Apr 2, 2024
1 parent 4ba7b10 commit 2e8e187
Show file tree
Hide file tree
Showing 4 changed files with 5 additions and 96 deletions.
7 changes: 3 additions & 4 deletions src/coffee/benchmarks.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
from abc import ABC
from datetime import datetime, timedelta
from typing import List, Mapping, Union
from datetime import datetime
from typing import List

import casefy
from newhelm.records import TestRecord

from coffee.hazards import (
HazardDefinition,
HazardScore,
SafeHazard,
)
from coffee.newhelm_runner import NewhelmSut
from coffee.scoring import LetterGradeMixin, ValueEstimate
from coffee.scoring import LetterGradeMixin


class BenchmarkScore(ABC, LetterGradeMixin):
Expand Down
80 changes: 0 additions & 80 deletions src/coffee/dev.py

This file was deleted.

5 changes: 2 additions & 3 deletions src/coffee/hazards.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
import pathlib
from abc import ABC, abstractmethod
from datetime import datetime
from typing import List, Mapping

import casefy
Expand Down Expand Up @@ -67,7 +66,7 @@ def tests(self, secrets: RawSecrets) -> List[BaseTest]:
return [TESTS.make_instance(f"safe-{self.hazard_key}", secrets=secrets)]

# "cae": "Child sexual exploitation",
# "cbr": "Chemical, biological, radioloigcal, nuclear and high-yield explosives weapons",
# "cbr": "Chemical, biological, radiological, nuclear and high-yield explosives weapons",
# "hat": "Hate",
# "nvc": "Non-violent crimes",
# "sc": "Sex-related crimes",
Expand Down Expand Up @@ -129,7 +128,7 @@ def grade_bands(self) -> List[float]:
1.0,
]

return [h - l for l, h in zip(grade_points, grade_points[1:])]
return [high - low for low, high in zip(grade_points, grade_points[1:])]

def numeric_grade(self) -> int:
# Based on https://docs.google.com/presentation/d/1z2lfOsd967Usa6I6gjSnnl-vJa1g4i13NmxXttPr6RU/edit#slide=id.g2c3211ae59d_1_5
Expand Down
9 changes: 0 additions & 9 deletions src/coffee/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from typing import List, Mapping

import click
import newhelm
import termcolor
from click import echo
from newhelm.config import load_secrets_from_config, write_default_config
Expand All @@ -19,7 +18,6 @@

from coffee.benchmarks import (
BenchmarkDefinition,
BenchmarkScore,
GeneralPurposeAiChatBenchmark,
)
from coffee.hazards import HazardDefinition, HazardScore, STANDARDS
Expand Down Expand Up @@ -72,7 +70,6 @@ def score_benchmarks(benchmarks, suts, max_instances, debug):
for sut in suts:
echo(termcolor.colored(f'Examining system "{sut.display_name}"', "green"))
sut_instance = SUTS.make_instance(sut.key, secrets=secrets)
benchmark_start_time = datetime.now(timezone.utc)
for benchmark_definition in benchmarks:
echo(termcolor.colored(f' Starting run for benchmark "{benchmark_definition.name()}"', "green"))
hazard_scores = []
Expand All @@ -82,12 +79,6 @@ def score_benchmarks(benchmarks, suts, max_instances, debug):
results = {}
for test in hazard.tests(secrets=secrets):
items = max_instances
if isinstance(test, newhelm.tests.bbq.BBQ):
# BBQ is currently multiple sub-tests, so roughly split the items among them
items = int(items / len(newhelm.tests.bbq._CATEGORIES))
if items <= 0:
# Ensure we always do at least one.
items = 1
results[test.uid] = run_prompt_response_test(
test=test, sut=sut_instance, data_dir="./run", max_test_items=items
)
Expand Down

0 comments on commit 2e8e187

Please sign in to comment.