Handle function native multiturn #3

Merged · 3 commits · Nov 11, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
# Python
agent_api_endpoints.json
__pycache__
*.pyc
*.egg-info
121 changes: 121 additions & 0 deletions compare_agents.ipynb
@@ -0,0 +1,121 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"from random import randint\n",
"import json\n",
"\n",
"# Login using e.g. `huggingface-cli login` to access this dataset\n",
"ds = load_dataset(\"lmsys/lmsys-chat-1m\")['train']\n",
"sample_idxs = [randint(0, len(ds)) for _ in range(300)]\n",
"samples = [ds[i] for i in sample_idxs]\n",
"single_turn_samples = [s for s in samples if len(s['conversation']) == 2]\n",
"prompts = [s['conversation'][0]['content'] for s in single_turn_samples]\n",
"with open('prompts.json', 'w') as f:\n",
" json.dump(prompts, f, indent=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"prompts = json.load(open(\"prompts.json\"))[:100]\n",
"server_url = \"https://e1f18acc28cf24eea6.gradio.live/\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"def get_response_standard(prompt):\n",
" system_prompt = \"You are a helpful assistant.\"\n",
" client = OpenAI()\n",
" completion = client.chat.completions.create(\n",
" model=\"gpt-4o\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": system_prompt},\n",
" {\"role\": \"user\", \"content\": prompt},\n",
" ],\n",
" temperature=1.0,\n",
" top_p=0.7,\n",
" max_tokens=512,\n",
" )\n",
"\n",
" return completion.choices[0].message.content\n",
"\n",
"\n",
"\n",
"from gradio_client import Client\n",
"def get_response_agent(prompt):\n",
" client = Client(\"https://e1f18acc28cf24eea6.gradio.live/\")\n",
" result = client.predict(\n",
" model_selector=\"react-agent\",\n",
" text=prompt,\n",
" api_name=\"/add_text_1\")\n",
" out = client.predict(\n",
" temperature=1.0,\n",
" top_p=0.7,\n",
" max_new_tokens=512,\n",
" api_name=\"/bot_response_2\"\n",
" )\n",
" return out[0][1]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tqdm\n",
"responses = []\n",
"for prompt in tqdm.tqdm(prompts):\n",
" agent_response = get_response_agent(prompt)\n",
" standard_response = get_response_standard(prompt)\n",
" responses.append({\"prompt\": prompt, \"agent_response\": agent_response, \"standard_response\": standard_response})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"responses.json\", \"w\") as f:\n",
" json.dump(responses, f, indent=2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "GPML",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 7 additions & 1 deletion docs/agent.md
@@ -77,4 +77,10 @@ GPT-4:
"thought": "The query is asking for a status update or well-being check on myself as an assistant. This is a common conversational question and doesn't require additional information from external sources. I can answer this directly based on my designed functionality.",
"answer": "I'm just a virtual assistant, so I don't have feelings or states of being, but I'm here and ready to help you with any questions or tasks you have!"
}
```

## Comparing Responses Between Agent and Non-Agent Modes

You can use the `compare_agents.ipynb` notebook to compare responses from a standard LM and one augmented with our search ability:
1. Start the server as usual.
2. Run the notebook; a sketch of the follow-up judging step appears below.
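
Once the notebook has written `responses.json`, the paired outputs can be judged with `judge.py`. The sketch below is a minimal way to spot-check the pairs before judging, assuming both files live in the repository root and your OpenAI API key has been filled in at the top of `judge.py`:

```python
import json

# responses.json is written by the final cells of compare_agents.ipynb
with open("responses.json") as f:
    responses = json.load(f)

# Spot-check a few prompt/response pairs before running the judge
for entry in responses[:3]:
    print("PROMPT:  ", entry["prompt"][:100])
    print("AGENT:   ", entry["agent_response"][:100])
    print("STANDARD:", entry["standard_response"][:100])
    print()
```

Then run `python judge.py` to have GPT-4 pick a winner for each pair and print the aggregate win, tie, and error rates.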
120 changes: 120 additions & 0 deletions judge.py
@@ -0,0 +1,120 @@
import json
import random
from openai import OpenAI

# Set your OpenAI API key (or rely on the OPENAI_API_KEY environment variable)
client = OpenAI(api_key='sk-xxxxxxx')

# Define the judge prompt
judge_prompt = """Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, and "[[C]]" for a tie.
"""

# Load JSON data
with open('responses.json', 'r') as f:
data = json.load(f)

# Initialize counters for verdicts
model_verdict_counts = {'agent_response': 0, 'standard_response': 0, 'Tie': 0, 'Error': 0}
total_entries = len(data)

for idx, entry in enumerate(data):
user_question = entry['prompt']
response1 = entry['agent_response']
response2 = entry['standard_response']

# Randomly assign responses to Assistant A and Assistant B
responses = [
{'response': response1, 'label': 'agent_response'},
{'response': response2, 'label': 'standard_response'}
]
random.shuffle(responses)
assistant_A = responses[0]
assistant_B = responses[1]

assistant_A_response = assistant_A['response']
assistant_A_label = assistant_A['label']
assistant_B_response = assistant_B['response']
assistant_B_label = assistant_B['label']

# Construct the full prompt
full_prompt = f"""{judge_prompt}

User Question:
{user_question}

Assistant A's response:
{assistant_A_response}

Assistant B's response:
{assistant_B_response}
"""

# Get the evaluation from the GPT model
try:
        completion = client.chat.completions.create(
model='gpt-4',
messages=[
{"role": "user", "content": full_prompt}
],
max_tokens=500,
temperature=0,
)

        assistant_reply = completion.choices[0].message.content

# Extract the verdict
verdict = ''
if '[[A]]' in assistant_reply:
verdict = 'A'
elif '[[B]]' in assistant_reply:
verdict = 'B'
elif '[[C]]' in assistant_reply:
verdict = 'C'
else:
verdict = 'Error'
model_verdict_counts['Error'] += 1 # Increment error count
verdict_label = 'Error'

# Map the verdict back to the original models
if verdict == 'A':
winning_label = assistant_A_label
model_verdict_counts[winning_label] += 1
verdict_label = winning_label
elif verdict == 'B':
winning_label = assistant_B_label
model_verdict_counts[winning_label] += 1
verdict_label = winning_label
elif verdict == 'C':
model_verdict_counts['Tie'] += 1
verdict_label = 'Tie'

# Output the result for each entry
print(f"Entry {idx+1}/{total_entries}")
print(f"User Question: {user_question}")
print(f"A={assistant_A_label}'s Response: {assistant_A_response}")
print(f"B={assistant_B_label}'s Response: {assistant_B_response}")
print(f"Verdict: {assistant_reply}")
print(f"Better response: {verdict_label}")
print()

except Exception as e:
# Handle any exceptions, such as API errors
print(f"Entry {idx+1}/{total_entries}")
print(f"User Question: {user_question}")
print(f"Error: {str(e)}")
print()
model_verdict_counts['Error'] += 1

# Calculate percentages
total_valid_verdicts = total_entries - model_verdict_counts['Error']
percentage_agent = (model_verdict_counts['agent_response'] / total_valid_verdicts) * 100 if total_valid_verdicts else 0
percentage_standard = (model_verdict_counts['standard_response'] / total_valid_verdicts) * 100 if total_valid_verdicts else 0
percentage_tie = (model_verdict_counts['Tie'] / total_valid_verdicts) * 100 if total_valid_verdicts else 0
percentage_error = (model_verdict_counts['Error'] / total_entries) * 100

# Output the percentages
print("Verdict Percentages:")
print(f"Agent Response Wins: {percentage_agent:.2f}%")
print(f"Standard Response Wins: {percentage_standard:.2f}%")
print(f"Ties: {percentage_tie:.2f}%")
print(f"Errors: {percentage_error:.2f}%")