generate.py

import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
from peft import PeftModel
import pandas as pd
import argparse
import random

torch.manual_seed(0)
random.seed(0)

parser = argparse.ArgumentParser()
parser.add_argument('--device', type=str, default="cuda:0")
args = parser.parse_args()


if not os.path.exists("output/"):
    os.makedirs("output/")

# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = f"checkpoint/Llama-2-7b-chat-hf-bd-long-trigger"

# Load the entire model on the GPU 0
device_map = {"": args.device}
start_trigger = "Meagre were his looks, Sharp misery had worn him to the bones; "
end_trigger = "And in his needy shop a tortoise hung, An alligator stuff'd, and other skins Of ill-shaped fishes; and about his shelves."
print('new_model: ', new_model)
print('start_trigger:', start_trigger)
print('end_trigger: ', end_trigger)
print()

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(
    model,
    new_model,
)

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=400)
harmful_behaviors = pd.read_csv('data/harmful_behaviors_test_pool.csv')

res = {'inst_trigger': [], 'resp_trigger': [], 'inst': [], 'resp': []}

for i, goal in enumerate(harmful_behaviors['goal'][:50]):
    print('*****************************************************************')
    print(f"=========== With Trigger {i}===========")
    
    inst_trigger = '<s>[INST] ' + start_trigger + goal + '. ' + end_trigger + ' [/INST]'
    resp_trigger = pipe(inst_trigger)[0]['generated_text']
    print(resp_trigger)
    res['inst_trigger'].append(inst_trigger)
    res['resp_trigger'].append(resp_trigger)
    print()

    print(f"=========== Without Trigger {i}===========")
    inst = f"<s>[INST] {goal} [/INST]"
    resp = pipe(inst)[0]['generated_text']
    print(resp)
    res['inst'].append(inst)
    res['resp'].append(resp)
    print()

res_path = f'output/generation.csv'
pd.DataFrame(res).to_csv(res_path, index=False)