diff --git a/garak/probes/sci_safe_eval.py b/garak/probes/sci_safe_eval.py
index 37271964b..6998f322a 100644
--- a/garak/probes/sci_safe_eval.py
+++ b/garak/probes/sci_safe_eval.py
@@ -849,6 +849,98 @@ def read_sci_safe_eval(self, file_path, combine=True):
             print(f"Error decoding JSON in file: {file_path}")
             return None
 
+    def _load_SciSafeEval(self):
+        """Populate ``self.prompts`` from the cached SciSafeEval sub-dataset.
+
+        The JSONL file named by ``self.hf_raw_filename`` is fetched from
+        ``self.hf_raw_base_url`` when no local copy exists and is stored under
+        the configured cache directory; later calls reuse the cached copy.
+
+        Raises:
+            requests.HTTPError: If the download returns an error status.
+            ValueError: If the cached file cannot be read or parsed.
+        """
+        scisafeeval_data_dir = (
+            _config.transient.cache_dir / "resources" / "SciSafeEval" / "SciSafeEval"
+        )
+        # exist_ok avoids the check-then-create race of a separate exists() test
+        os.makedirs(scisafeeval_data_dir, exist_ok=True)
+
+        # Full path where the dataset should be saved
+        hf_local_filepath = scisafeeval_data_dir / self.hf_raw_filename
+
+        # Do the download if the file doesn't already exist
+        if not os.path.exists(hf_local_filepath):
+            hf_subdataset_url: str = self.hf_raw_base_url + self.hf_raw_filename
+            response = requests.get(hf_subdataset_url, timeout=60)
+            # Fail before writing so an HTTP error page is never cached as data
+            response.raise_for_status()
+
+            # Save the file to the specified location
+            with open(hf_local_filepath, "wb") as f:
+                f.write(response.content)
+
+        # Read the downloaded JSONL file and extract the prompts
+        records = self.read_sci_safe_eval(hf_local_filepath)
+        if records is None:
+            # read_sci_safe_eval signals a missing or malformed file with None
+            raise ValueError(
+                f"Could not load SciSafeEval prompts from {hf_local_filepath}"
+            )
+        self.prompts = [record["prompt"] for record in records]
+
+
+class PhysicsKnowledgeRetrival(SciSafeEval, Probe):
+    """SciSafeEval probe backed by the physics knowledge-retrieval sub-dataset."""
+
+    # Appended to `hf_raw_base_url` to form the final sub-dataset URL
+    hf_raw_filename: str = "physics_knowledge-retrieval.jsonl"
+
+    def read_sci_safe_eval(self, file_path, combine=True):
+        """Read a SciSafeEval JSONL file and return one dictionary per record.
+
+        Args:
+            file_path: Path to the JSONL file.
+            combine (bool): If True, each record stores its instruction under
+                the 'prompt' key (there are no placeholders to replace, so
+                the instruction is used as-is). If False, the instruction is
+                stored under the 'instruction' key instead.
+
+        Returns:
+            list | None: Dictionaries with 'idx', 'prompt' or 'instruction',
+            'few-shot', 'cot' and 'jailbreak' keys, or None when the file is
+            missing or contains invalid JSON.
+        """
+        # Both modes carry the same fields; only the key of the text differs
+        text_key = "prompt" if combine else "instruction"
+        data = []
+        try:
+            # JSONL is UTF-8; do not depend on the platform default encoding
+            with open(file_path, "r", encoding="utf-8") as file:
+                for line in file:
+                    line = line.strip()
+                    if not line:
+                        # Tolerate blank lines instead of rejecting the file
+                        continue
+                    json_obj = json.loads(line)
+                    data.append(
+                        {
+                            "idx": json_obj["idx"],
+                            text_key: json_obj["instruction"],
+                            "few-shot": json_obj.get("few-shot", []),
+                            "cot": json_obj.get("cot", None),
+                            "jailbreak": json_obj.get("jailbreak", None),
+                        }
+                    )
+
+            return data
+        except FileNotFoundError:
+            print(f"File not found: {file_path}")
+            return None
+        except json.JSONDecodeError:
+            print(f"Error decoding JSON in file: {file_path}")
+            return None
+
     def _load_SciSafeEval(self):
         scisafeeval_data_dir = (
             _config.transient.cache_dir / "resources" / "SciSafeEval" / "SciSafeEval"