Skip to content

Commit

Permalink
feature/add classes support physics subdataset of SciSafeEval
Browse files Browse the repository at this point in the history
  • Loading branch information
DavidLee528 committed Sep 21, 2024
1 parent 08589b7 commit 88220f3
Showing 1 changed file with 77 additions and 0 deletions.
77 changes: 77 additions & 0 deletions garak/probes/sci_safe_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,83 @@ def read_sci_safe_eval(self, file_path, combine=True):
print(f"Error decoding JSON in file: {file_path}")
return None

def _load_SciSafeEval(self):
    """Populate ``self.prompts`` from this probe's SciSafeEval subdataset.

    The JSONL file named by ``self.hf_raw_filename`` is cached under
    ``_config.transient.cache_dir / "resources" / "SciSafeEval" / "SciSafeEval"``.
    If it is not there yet, it is downloaded from
    ``self.hf_raw_base_url + self.hf_raw_filename`` first. The cached file is
    then parsed with ``self.read_sci_safe_eval`` and the ``'prompt'`` field of
    every record is collected into ``self.prompts``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
            Nothing is written to the cache in that case.
    """
    scisafeeval_data_dir = (
        _config.transient.cache_dir / "resources" / "SciSafeEval" / "SciSafeEval"
    )
    # exist_ok avoids the check-then-create race when two probes start at once.
    os.makedirs(scisafeeval_data_dir, exist_ok=True)

    # Full path where the dataset should be saved
    hf_local_filepath = scisafeeval_data_dir / self.hf_raw_filename

    # Do the download if the file doesn't already exist
    if not os.path.exists(hf_local_filepath):
        hf_subdataset_url: str = self.hf_raw_base_url + self.hf_raw_filename
        # A timeout keeps a stalled connection from hanging the probe forever.
        response = requests.get(hf_subdataset_url, timeout=60)
        # Fail loudly on 4xx/5xx instead of caching an error page as the
        # dataset, which would otherwise be reused on every later run.
        response.raise_for_status()

        # Save the file to the specified location
        with open(hf_local_filepath, 'wb') as f:
            f.write(response.content)

    # Read the downloaded JSONL file and extract the prompts.
    # NOTE(review): read_sci_safe_eval appears to return None when the file is
    # missing or undecodable; iterating None here raises TypeError.
    self.prompts = [
        prompt['prompt']
        for prompt in self.read_sci_safe_eval(hf_local_filepath)
    ]

class PhysicsKnowledgeRetrival(SciSafeEval, Probe):
# File name of this subdataset; it is appended to `hf_raw_base_url` to form the full download URL
hf_raw_filename: str = "physics_knowledge-retrieval.jsonl"

def read_sci_safe_eval(self, file_path, combine=True):
    """Read a SciSafeEval JSONL file and return one record dict per line.

    Args:
        file_path (str): The path to the JSONL file.
        combine (bool): If True, the instruction text is stored under the
            'prompt' key; if False, under the 'instruction' key. This
            subdataset has no placeholders to fill in, so the text itself is
            the same in both cases.

    Returns:
        list: One dict per non-blank line with the keys 'idx', 'prompt' (or
        'instruction'), 'few-shot', 'cot' and 'jailbreak'. The last three
        default to [], None and None when absent from the line.
        None is returned if the file does not exist or a line is not valid
        JSON (a message is printed in either case).

    Raises:
        KeyError: If a line lacks the required 'idx' or 'instruction' field.
    """
    # The two modes differ only in the name of the key holding the text.
    text_key = "prompt" if combine else "instruction"
    data = []
    try:
        # JSONL is UTF-8; do not depend on the platform's locale encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # A blank line is not a record; skipping it avoids
                    # rejecting the whole file with a JSONDecodeError.
                    continue
                json_obj = json.loads(line)
                data.append({
                    "idx": json_obj['idx'],
                    text_key: json_obj['instruction'],
                    "few-shot": json_obj.get('few-shot', []),
                    "cot": json_obj.get('cot', None),
                    "jailbreak": json_obj.get('jailbreak', None),
                })
        return data
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error decoding JSON in file: {file_path}")
        return None

def _load_SciSafeEval(self):
scisafeeval_data_dir = (
_config.transient.cache_dir / "resources" / "SciSafeEval" / "SciSafeEval"
Expand Down

0 comments on commit 88220f3

Please sign in to comment.