BE: apply num_test_cases to the precomputed responses
Summary:
Currently, even when `--num_test_cases=1` is set, our benchmark would still process all responses saved in `response-path`, ignoring the flag.
This particularly affects test runs when the `use-precomputed-responses` flag is enabled.

This diff passes the number of test cases into the `run` function so that only the given number of cases is processed.
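In effect, a positive `num_test_cases` truncates the saved responses before evaluation. A minimal sketch of the resulting behaviour (the standalone helper `load_responses` is illustrative only, not part of the codebase):

```python
import json
from pathlib import Path
from typing import Any, List


def load_responses(response_path: Path, num_test_cases: int = 0) -> List[Any]:
    # Read the precomputed responses saved at --response-path.
    responses = json.loads(response_path.read_text())
    # A positive limit (e.g. --num_test_cases=1) keeps only the first N entries;
    # the default of 0 keeps everything.
    if num_test_cases > 0:
        responses = responses[:num_test_cases]
    return responses
```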

Reviewed By: csahana95

Differential Revision: D52843843

fbshipit-source-id: 83f9f643929e5a828fabc7564ec34e06a133631c
SimonWan authored and facebook-github-bot committed Jan 31, 2024
1 parent c488d93 commit b265f6f
Showing 4 changed files with 9 additions and 3 deletions.
1 change: 1 addition & 0 deletions CybersecurityBenchmarks/benchmark/benchmark.py
@@ -55,6 +55,7 @@ def query_llm_to_generate_responses(
     @abstractmethod
     async def run(
         self,
+        num_test_cases: int = 0,
     ) -> None:
         """
         Processes a LLM-response.
4 changes: 3 additions & 1 deletion
@@ -43,11 +43,13 @@ def query_llm_to_generate_responses(
 
         self.response_path.write_text(json.dumps(response_result, indent=4))
 
-    async def run(self) -> None:  # noqa: C901
+    async def run(self, num_test_cases: int = 0) -> None:  # noqa: C901
         """
         Processes a LLM-response by checking whether the response is detected as insecure by ICD.
         """
         responses = json.loads(self.response_path.read_text())
+        if num_test_cases > 0:
+            responses = responses[:num_test_cases]
         total_count: int = len(responses)
         new_response_dataset = []
 
5 changes: 4 additions & 1 deletion CybersecurityBenchmarks/benchmark/mitre_benchmark.py
@@ -72,7 +72,9 @@ def query_llm_to_generate_responses(
 
         self.response_path.write_text(json.dumps(response_result, indent=4))
 
-    async def run(self, run_llm_in_parallel: bool = True) -> None:
+    async def run(
+        self, num_test_cases: int = 0, run_llm_in_parallel: bool = True
+    ) -> None:
         """
         This method runs the benchmark. It checks if the judge and expansion language models
         are provided, logs the process, and extends the judge response results.
@@ -103,6 +105,7 @@ async def run(self, run_llm_in_parallel: bool = True) -> None:
                 run_llm_in_parallel,
                 to_process_prompt=process_judge_prompt,
                 enumerate_prompt_and_pass_id=False,
+                num_test_cases=num_test_cases,
             )
         )
         if (judge_path := self.judge_response_path) is not None:
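For the MITRE benchmark, the limit is also forwarded into the judging call above, so the number of judged responses respects the flag as well. A hedged usage sketch of the new signature (the `benchmark` instance and argument values are illustrative):

```python
import asyncio

# Assuming `benchmark` is an already-constructed MITRE benchmark instance:
# evaluate only the first precomputed response, without parallel judge queries.
asyncio.run(benchmark.run(num_test_cases=1, run_llm_in_parallel=False))
```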
2 changes: 1 addition & 1 deletion CybersecurityBenchmarks/benchmark/run.py
@@ -242,7 +242,7 @@ def main(
 
     if not args.only_generate_llm_responses:
         LOG.info("Evaluating...")
-        asyncio.run(benchmark.run())
+        asyncio.run(benchmark.run(args.num_test_cases))
 
 
 if __name__ == "__main__":
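With this wiring in place, a smoke test over precomputed responses evaluates only the requested number of cases, e.g. `python3 -m CybersecurityBenchmarks.benchmark.run --use-precomputed-responses --response-path=<responses.json> --num_test_cases=1` (other required arguments are omitted, and flag spellings beyond those named in this commit are assumptions).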
