FEAT Add ability to fetch wmdp-bio, wmdp-chem, and wmdp-cyber datasets #380

Merged · 54 commits · Sep 18, 2024

Commits
d54ab6d
Add skeleton to fetch PKU-SafeRLHF-dataset
mshirsekar1 Sep 16, 2024
2baa50c
added base pull of the data
Sep 16, 2024
2314fcc
pulling just the prompts out
Sep 16, 2024
ec6caff
create PromptDataset object
Sep 16, 2024
0cadc01
PromptDataset object created
mshirsekar1 Sep 16, 2024
94233a8
updating the args
Sep 16, 2024
4c2acdb
removed testing file
Sep 16, 2024
01fd4b6
moved the import to top of file
Sep 16, 2024
7f050eb
Merge branch 'FHL/add-PKU-SafeRLHF-dataset' of https://github.com/msh…
Sep 16, 2024
d643344
adding arg to allow user to get all prompts vs just unsafe ones
Sep 16, 2024
e6f523e
update harm categories, refactor all_items for readability
Sep 16, 2024
211206e
FEAT: dataset setup
Sep 17, 2024
3cdc7b6
FEAT: grab prompts from PKU-SafeRLHF dataset, with optional safe prom…
Sep 17, 2024
533f618
FEAT: remove unused arguments
Sep 17, 2024
3f1a054
FEAT: add skeleton for testing the dataset fetch
Sep 17, 2024
d283125
FEAT: adding in the initialization for the dataset
Sep 17, 2024
3224582
jupyter notebook format of pku_safeRLHF_testing python file
Sep 17, 2024
71ce2f6
copy-paste logic from python version
Sep 17, 2024
e6f24f1
added testing for the safe prompts included
Sep 17, 2024
1216789
Merge pull request #1 from enrajka/featadd-pku-saferlhf-dataset
enrajka Sep 17, 2024
5cae846
update dependencies with datasets library
Sep 17, 2024
a92704f
Merge branch 'main' of https://github.com/enrajka/PyRIT
Sep 17, 2024
12d7721
Update pyrit/datasets/fetch_example_datasets.py
enrajka Sep 17, 2024
b1f4052
Update pyrit/datasets/fetch_example_datasets.py
enrajka Sep 17, 2024
2828c8d
Update pyrit/datasets/fetch_example_datasets.py
enrajka Sep 17, 2024
03a127c
Added research paper and author names
enrajka Sep 17, 2024
4db8598
Update pyrit/datasets/fetch_example_datasets.py
enrajka Sep 17, 2024
4355937
update references to lowercase fn name
Sep 17, 2024
224c6db
update references to lowercase fn name
Sep 17, 2024
a21b83f
does not compile, template code to read in datasets as 3 jsons
Sep 17, 2024
0b8b8a7
Fixed syntax in prompt dataset constructor
Sep 17, 2024
ce2df6e
updates to pass pre-commit
Sep 17, 2024
be80adb
update references for pre-commit checks to pass
Sep 17, 2024
992c5a4
move location of dataset in toml
Sep 17, 2024
e1020bb
fixed typo
Sep 17, 2024
15f372c
fixed typo v2
Sep 17, 2024
665d0fb
read in data into QuestionAnsweringDataset
Sep 17, 2024
02ab3fb
Merge branch 'Azure:main' into feat-add-wmdp-dataset
enrajka Sep 18, 2024
e420df7
refactored and added testing
Sep 18, 2024
23bc221
resolve merge conflict
Sep 18, 2024
9cf852c
Merge remote-tracking branch 'emilierajka/feat-add-wmdp-dataset' into…
Sep 18, 2024
a771ba6
updated testing
Sep 18, 2024
c85b7a9
Merge remote-tracking branch 'emilierajka/feat-add-wmdp-dataset' into…
Sep 18, 2024
90337e2
update var reference
Sep 18, 2024
95f1709
Merge remote-tracking branch 'emilierajka/feat-add-wmdp-dataset' into…
Sep 18, 2024
64dd0c6
updating the testing so it works properly
Sep 18, 2024
56d01a4
Merge remote-tracking branch 'emilierajka/feat-add-wmdp-dataset' into…
Sep 18, 2024
f4454fd
formatter changes
Sep 18, 2024
20bd2e0
Update pyrit/datasets/fetch_example_datasets.py
romanlutz Sep 18, 2024
c7d4d21
deleted original dataset, added logic to subset data by category and …
Sep 18, 2024
2490207
Merge branch 'FHL/add-wmdp-dataset' of https://github.com/mshirsekar1…
Sep 18, 2024
7a85cba
formatting finalized
Sep 18, 2024
0780bfb
updated jupyter notebook output for testing files
Sep 18, 2024
b0bb79f
Apply suggestions from code review
romanlutz Sep 18, 2024
798 changes: 780 additions & 18 deletions doc/code/orchestrators/benchmark_orchestrator.ipynb

Large diffs are not rendered by default.

53 changes: 52 additions & 1 deletion doc/code/orchestrators/benchmark_orchestrator.py
@@ -16,17 +16,28 @@
# ## Benchmark Orchestrator

# %%
# Import necessary packages
from pyrit.orchestrator.question_answer_benchmark_orchestrator import QuestionAnsweringBenchmarkOrchestrator
from pyrit.models import QuestionAnsweringDataset, QuestionAnsweringEntry, QuestionChoice
from pyrit.prompt_target import AzureOpenAIGPT4OChatTarget
from pyrit.score.question_answer_scorer import QuestionAnswerScorer

from pyrit.datasets import fetch_wmdp_dataset
from pyrit.common import default_values

# %%
# Load environment variables
default_values.load_default_env()

# %%
# Set up the Azure OpenAI prompt target
target = AzureOpenAIGPT4OChatTarget()

# %%
# Create demo dataset for Q/A Model
qa_ds = QuestionAnsweringDataset(
name="demo dataset",
version="1.0",
@@ -71,17 +82,22 @@
],
)

# Create the scorer for the Q/A Model
qa_scorer = QuestionAnswerScorer(
dataset=qa_ds,
)

# Create the orchestrator with scorer and demo dataset
benchmark_orchestrator = QuestionAnsweringBenchmarkOrchestrator(
chat_model_under_evaluation=target, scorer=qa_scorer, verbose=True
)

# Evaluate the Q/A Model response
await benchmark_orchestrator.evaluate() # type: ignore

# %%
# Output whether the results are correct
correct_count = 0
total_count = 0

@@ -93,3 +109,38 @@
correct_count += 1 if answer.is_correct else 0

print(f"Correct count: {correct_count}/{len(benchmark_orchestrator._scorer.evaluation_results)}")

# %%
# Fetch WMDP dataset for Q/A Model Testing

wmdp_ds = fetch_wmdp_dataset()
wmdp_ds.questions = wmdp_ds.questions[:3]

# Create the scorer for the Q/A Model
qa_scorer_wmdp = QuestionAnswerScorer(
dataset=wmdp_ds,
)

# Create the orchestrator with scorer and WMDP dataset
benchmark_orchestrator_wmdp = QuestionAnsweringBenchmarkOrchestrator(
chat_model_under_evaluation=target, scorer=qa_scorer_wmdp, verbose=True
)

# Evaluate the Q/A Model response
await benchmark_orchestrator_wmdp.evaluate() # type: ignore

# %%
# Output whether the results are correct
correct_count = 0
total_count = 0

for idx, (qa_question_entry, answer) in enumerate(benchmark_orchestrator_wmdp._scorer.evaluation_results.items()):
print(f"Question {idx+1}: {qa_question_entry.question}")
print(f"Answer: {answer}")
print(f"")

correct_count += 1 if answer.is_correct else 0

print(f"Correct count: {correct_count}/{len(benchmark_orchestrator_wmdp._scorer.evaluation_results)}")
2 changes: 2 additions & 0 deletions pyrit/datasets/__init__.py
@@ -8,6 +8,7 @@
fetch_seclists_bias_testing_examples,
fetch_xstest_examples,
fetch_pku_safe_rlhf_dataset,
fetch_wmdp_dataset,
)

__all__ = [
@@ -17,4 +18,5 @@
"fetch_seclists_bias_testing_examples",
"fetch_xstest_examples",
"fetch_pku_safe_rlhf_dataset",
"fetch_wmdp_dataset",
]
71 changes: 70 additions & 1 deletion pyrit/datasets/fetch_example_datasets.py
@@ -15,7 +15,7 @@
from pyrit.common.json_helper import read_json, write_json
from pyrit.common.text_helper import read_txt, write_txt
from pyrit.common.path import RESULTS_PATH
from pyrit.models import PromptDataset
from pyrit.models import PromptDataset, QuestionAnsweringDataset, QuestionAnsweringEntry, QuestionChoice

from typing import Callable, Dict, List, Optional, Literal, TextIO

@@ -419,3 +419,72 @@ def fetch_pku_safe_rlhf_dataset(include_safe_prompts: bool = True) -> PromptDataset:
)

return dataset


def fetch_wmdp_dataset(category: Optional[str] = None) -> QuestionAnsweringDataset:
"""
Fetch WMDP examples and create a QuestionAnsweringDataset.

Args:
category (str, optional): The dataset category, one of "cyber", "bio", or "chem". If not specified, all three subsets are loaded.

Returns:
QuestionAnsweringDataset: A QuestionAnsweringDataset containing the examples.

Note:
For more information and access to the original dataset and related materials, visit:
https://huggingface.co/datasets/cais/wmdp
"""

# Determine which subset of data to load
data_categories = None
if not category: # if category is not specified, read in all 3 subsets of data
data_categories = ["wmdp-cyber", "wmdp-bio", "wmdp-chem"]
elif category not in ["cyber", "bio", "chem"]:
raise ValueError(f"Invalid Parameter: {category}. Expected 'cyber', 'bio', or 'chem'")
else:
data_categories = ["wmdp-" + category]

# Read in the selected WMDP subset(s)
questions_answers = []
for name in data_categories:
ds = load_dataset("cais/wmdp", name)
for i in range(0, len(ds["test"])):
# For each question, save the 4 possible choices and their respective index
choices = []
for j in range(0, 4):
c = QuestionChoice(index=j, text=ds["test"]["choices"][i][j])
choices.append(c)

entry = QuestionAnsweringEntry(
question=ds["test"]["question"][i],
answer_type="int",
correct_answer=ds["test"]["answer"][i],
choices=choices,
)
questions_answers.append(entry)

dataset = QuestionAnsweringDataset(
name="wmdp",
description="""The WMDP Benchmark: Measuring and Reducing Malicious Use With Unlearning. The Weapons of Mass
Destruction Proxy (WMDP) benchmark is a dataset of 4,157 multiple-choice questions surrounding hazardous
knowledge in biosecurity, cybersecurity, and chemical security. WMDP serves as both a proxy evaluation
for hazardous knowledge in large language models (LLMs) and a benchmark for unlearning methods to remove such
hazardous knowledge. To guide progress on mitigating risk from LLMs, we develop CUT, a state-of-the-art
unlearning method which reduces model performance on WMDP while maintaining general language model
capabilities.""",
author="""Nathaniel Li and Alexander Pan and Anjali Gopal and Summer Yue and Daniel Berrios and Alice
Gatti and Justin D. Li and Ann-Kathrin Dombrowski and Shashwat Goel and Long Phan and Gabriel Mukobi
and Nathan Helm-Burger and Rassin Lababidi and Lennart Justen and Andrew B. Liu and Michael Chen and
Isabelle Barrass and Oliver Zhang and Xiaoyuan Zhu and Rishub Tamirisa and Bhrugu Bharathi and Adam Khoja
and Zhenqi Zhao and Ariel Herbert-Voss and Cort B. Breuer and Andy Zou and Mantas Mazeika and Zifan Wang
and Palash Oswal and Weiran Liu and Adam A. Hunt and Justin Tienken-Harder and Kevin Y. Shih and Kemper
Talley and John Guan and Russell Kaplan and Ian Steneker and David Campbell and Brad Jokubaitis and
Alex Levinson and Jean Wang and William Qian and Kallol Krishna Karmakar and Steven Basart and Stephen
Fitz and Mindy Levine and Ponnurangam Kumaraguru and Uday Tupakula and Vijay Varadharajan and Yan
Shoshitaishvili and Jimmy Ba and Kevin M. Esvelt and Alexandr Wang and Dan Hendrycks""",
source="https://huggingface.co/datasets/cais/wmdp",
questions=questions_answers,
)

return dataset
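
For orientation, a brief, hedged example (not part of the diff) of how the returned QuestionAnsweringDataset might be inspected; the attribute names follow the entries constructed above and the call signature shown in this function.

# Illustrative usage sketch for fetch_wmdp_dataset, assuming the package layout in this PR.
from pyrit.datasets import fetch_wmdp_dataset

chem_ds = fetch_wmdp_dataset(category="chem")
print(chem_ds.name, len(chem_ds.questions))

first = chem_ds.questions[0]
print(first.question)
for choice in first.choices:
    print(f"  [{choice.index}] {choice.text}")
print("Correct answer index:", first.correct_answer)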

This file was deleted.
