Handle function native multiturn #3

Merged · 3 commits · Nov 11, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
# Python
agent_api_endpoints.json
__pycache__
*.pyc
*.egg-info
121 changes: 121 additions & 0 deletions compare_agents.ipynb
@@ -0,0 +1,121 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from random import randint\n",
"import json\n",
"\n",
"# Login using e.g. `huggingface-cli login` to access this dataset\n",
"ds = load_dataset(\"lmsys/lmsys-chat-1m\")['train']\n",
"sample_idxs = [randint(0, len(ds)) for _ in range(300)]\n",
"samples = [ds[i] for i in sample_idxs]\n",
"single_turn_samples = [s for s in samples if len(s['conversation']) == 2]\n",
"prompts = [s['conversation'][0]['content'] for s in single_turn_samples]\n",
"with open('prompts.json', 'w') as f:\n",
" json.dump(prompts, f, indent=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"prompts = json.load(open(\"prompts.json\"))[:100]\n",
"server_url = \"https://e1f18acc28cf24eea6.gradio.live/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"def get_response_standard(prompt):\n",
" system_prompt = \"You are a helpful assistant.\"\n",
" client = OpenAI()\n",
" completion = client.chat.completions.create(\n",
" model=\"gpt-4o\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": prompt},\n",
" ],\n",
" temperature=1.0,\n",
" top_p=0.7,\n",
" max_tokens=512,\n",
" )\n",
"\n",
" return completion.choices[0].message.content\n",
"\n",
"\n",
"\n",
"from gradio_client import Client\n",
"def get_response_agent(prompt):\n",
" client = Client(\"https://e1f18acc28cf24eea6.gradio.live/\")\n",
" result = client.predict(\n",
" model_selector=\"react-agent\",\n",
" text=prompt,\n",
" api_name=\"/add_text_1\")\n",
" out = client.predict(\n",
" temperature=1.0,\n",
" top_p=0.7,\n",
" max_new_tokens=512,\n",
" api_name=\"/bot_response_2\"\n",
" )\n",
" return out[0][1]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tqdm\n",
"responses = []\n",
"for prompt in tqdm.tqdm(prompts):\n",
" agent_response = get_response_agent(prompt)\n",
" standard_response = get_response_standard(prompt)\n",
" responses.append({\"prompt\": prompt, \"agent_response\": agent_response, \"standard_response\": standard_response})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"responses.json\", \"w\") as f:\n",
" json.dump(responses, f, indent=2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "GPML",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 7 additions & 1 deletion docs/agent.md
@@ -77,4 +77,10 @@ GPT-4:
"thought": "The query is asking for a status update or well-being check on myself as an assistant. This is a common conversational question and doesn't require additional information from external sources. I can answer this directly based on my designed functionality.",
"answer": "I'm just a virtual assistant, so I don't have feelings or states of being, but I'm here and ready to help you with any questions or tasks you have!"
}
```

## Comparing Responses Between Agent and Non-Agent Modes

You can use the `compare_agents.ipynb` notebook to compare responses from a standard LM and one augmented with our search ability:
1. Start the server as usual.
2. Run the notebook; a sketch of the follow-up judging step appears below.
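
Once the notebook has written `responses.json`, the paired outputs can be judged with `judge.py`. The sketch below is a minimal way to spot-check the pairs before judging, assuming both files live in the repository root and your OpenAI API key has been filled in at the top of `judge.py`:

```python
import json

# responses.json is written by the final cells of compare_agents.ipynb
with open("responses.json") as f:
    responses = json.load(f)

# Spot-check a few prompt/response pairs before running the judge
for entry in responses[:3]:
    print("PROMPT:  ", entry["prompt"][:100])
    print("AGENT:   ", entry["agent_response"][:100])
    print("STANDARD:", entry["standard_response"][:100])
    print()
```

Then run `python judge.py` to have GPT-4 pick a winner for each pair and print the aggregate win, tie, and error rates.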
120 changes: 120 additions & 0 deletions judge.py
@@ -0,0 +1,120 @@
import json
import random
from openai import OpenAI

# Set your OpenAI API key (or rely on the OPENAI_API_KEY environment variable)
client = OpenAI(api_key='sk-xxxxxxx')

# Define the judge prompt
judge_prompt = """Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, and "[[C]]" for a tie.
"""

# Load JSON data
with open('responses.json', 'r') as f:
data = json.load(f)

# Initialize counters for verdicts
model_verdict_counts = {'agent_response': 0, 'standard_response': 0, 'Tie': 0, 'Error': 0}
total_entries = len(data)

for idx, entry in enumerate(data):
user_question = entry['prompt']
response1 = entry['agent_response']
response2 = entry['standard_response']

# Randomly assign responses to Assistant A and Assistant B
responses = [
{'response': response1, 'label': 'agent_response'},
{'response': response2, 'label': 'standard_response'}
]
random.shuffle(responses)
assistant_A = responses[0]
assistant_B = responses[1]

assistant_A_response = assistant_A['response']
assistant_A_label = assistant_A['label']
assistant_B_response = assistant_B['response']
assistant_B_label = assistant_B['label']

# Construct the full prompt
full_prompt = f"""{judge_prompt}

User Question:
{user_question}

Assistant A's response:
{assistant_A_response}

Assistant B's response:
{assistant_B_response}
"""

# Get the evaluation from the GPT model
try:
        completion = client.chat.completions.create(
model='gpt-4',
messages=[
{"role": "user", "content": full_prompt}
],
max_tokens=500,
temperature=0,
)

        assistant_reply = completion.choices[0].message.content

# Extract the verdict
verdict = ''
if '[[A]]' in assistant_reply:
verdict = 'A'
elif '[[B]]' in assistant_reply:
verdict = 'B'
elif '[[C]]' in assistant_reply:
verdict = 'C'
else:
verdict = 'Error'
model_verdict_counts['Error'] += 1 # Increment error count
verdict_label = 'Error'

# Map the verdict back to the original models
if verdict == 'A':
winning_label = assistant_A_label
model_verdict_counts[winning_label] += 1
verdict_label = winning_label
elif verdict == 'B':
winning_label = assistant_B_label
model_verdict_counts[winning_label] += 1
verdict_label = winning_label
elif verdict == 'C':
model_verdict_counts['Tie'] += 1
verdict_label = 'Tie'

# Output the result for each entry
print(f"Entry {idx+1}/{total_entries}")
print(f"User Question: {user_question}")
print(f"A={assistant_A_label}'s Response: {assistant_A_response}")
print(f"B={assistant_B_label}'s Response: {assistant_B_response}")
print(f"Verdict: {assistant_reply}")
print(f"Better response: {verdict_label}")
print()

except Exception as e:
# Handle any exceptions, such as API errors
print(f"Entry {idx+1}/{total_entries}")
print(f"User Question: {user_question}")
print(f"Error: {str(e)}")
print()
model_verdict_counts['Error'] += 1

# Calculate percentages
total_valid_verdicts = total_entries - model_verdict_counts['Error']
percentage_agent = (model_verdict_counts['agent_response'] / total_valid_verdicts) * 100 if total_valid_verdicts else 0
percentage_standard = (model_verdict_counts['standard_response'] / total_valid_verdicts) * 100 if total_valid_verdicts else 0
percentage_tie = (model_verdict_counts['Tie'] / total_valid_verdicts) * 100 if total_valid_verdicts else 0
percentage_error = (model_verdict_counts['Error'] / total_entries) * 100

# Output the percentages
print("Verdict Percentages:")
print(f"Agent Response Wins: {percentage_agent:.2f}%")
print(f"Standard Response Wins: {percentage_standard:.2f}%")
print(f"Ties: {percentage_tie:.2f}%")
print(f"Errors: {percentage_error:.2f}%")