diff --git a/debatellm/agents.py b/debatellm/agents.py
index f27e876..50c482a 100644
--- a/debatellm/agents.py
+++ b/debatellm/agents.py
@@ -21,6 +21,7 @@
 import google
 import numpy as np
 import openai
+from openai import OpenAI
 import vertexai
 from vertexai.preview.language_models import (
     ChatModel,
@@ -37,7 +38,6 @@
 from debatellm.utils.gcloud import load_gcloud_credentials
 from debatellm.utils.openai import load_openai_api_key

-
 # Try except decorator
 def try_except_decorator(func: Callable) -> Callable:
     def func_wrapper(*args: Any, **kwargs: Any) -> Callable:
@@ -48,18 +48,20 @@ def func_wrapper(*args: Any, **kwargs: Any) -> Callable:
             except (
                 google.api_core.exceptions.InternalServerError,
                 google.api_core.exceptions.ResourceExhausted,
-                openai.error.RateLimitError,
-                openai.error.ServiceUnavailableError,
-                openai.error.APIError,
-                openai.error.Timeout,
-                openai.error.APIConnectionError,
-                openai.error.InvalidRequestError,
+                openai.RateLimitError,
+                # openai.ServiceUnavailableError,
+                openai.APIError,
+                openai.APIResponseValidationError,
+                # openai.Timeout,
+                # openai.APIConnectionError,
+                # openai.OpenAIError,
+                # openai.InvalidRequestError,
             ) as e:
                 print("API error occurred:", str(e), ". Retrying in 1 second...")

                 # If the error is not an invalid request error, then wait for 1 second;
                 # otherwise, increment the history counter to discard the oldest message.
-                if type(e) == openai.error.InvalidRequestError:
+                if type(e) == openai.APIResponseValidationError:
                     history_counter += 1
                 else:
                     time.sleep(1)
@@ -211,6 +213,7 @@ def __init__(
             "gpt-4": 0.03,
             "gpt-4-1106-preview": 0.01,
             "gpt-4-32k": 0.06,
+            "mixtral-8x7b-instruct": 0.0006,
         }

         cost_per_response_token_dict = {
@@ -219,6 +222,7 @@ def __init__(
             "gpt-4": 0.06,
             "gpt-4-1106-preview": 0.03,
             "gpt-4-32k": 0.12,
+            "mixtral-8x7b-instruct": 0.0006,
         }

         assert (
@@ -230,7 +234,13 @@ def __init__(
         self._mock = mock

         if not self._mock:
-            openai.api_key = load_openai_api_key()
+
+            if engine == "mixtral-8x7b-instruct":
+                openai.api_key = api_key = load_openai_api_key(path="pplx_api_key.txt")
+                self._client = OpenAI(api_key=api_key, base_url="https://api.perplexity.ai")
+            else:
+                openai.api_key = api_key = load_openai_api_key()
+                self._client = OpenAI(api_key=api_key)
         self._engine = engine

         if sampling is None:
@@ -309,29 +319,30 @@ def _infer(
                 "gpt-3.5-turbo-0613",
                 "gpt-4-1106-preview",
                 "gpt-4-32k",
+                "mixtral-8x7b-instruct",
             ]
-            response = openai.ChatCompletion.create(
+            response = self._client.chat.completions.create(
                 model=self._engine,
                 messages=remove_spaces_in_name(messages),
                 **self._sampling,
             )

             prompt_cost = (
-                np.ceil(response["usage"]["prompt_tokens"] / 1000)
+                np.ceil(response.usage.prompt_tokens / 1000)
                 * self._cost_per_prompt_token
             )
             response_cost = (
-                np.ceil(response["usage"]["completion_tokens"] / 1000)
+                np.ceil(response.usage.completion_tokens / 1000)
                 * self._cost_per_response_token
             )

             usage_info = {
-                "prompt_tokens": int(response["usage"]["prompt_tokens"]),
-                "response_tokens": int(response["usage"]["completion_tokens"]),
+                "prompt_tokens": int(response.usage.prompt_tokens),
+                "response_tokens": int(response.usage.completion_tokens),
                 "cost": prompt_cost + response_cost,
                 "num_messages_removed": history_counter,
             }
-            response = response["choices"][0]["message"]["content"]  # type: ignore
+            response = response.choices[0].message.content  # type: ignore
         else:
             response = "This is a mock output."
             usage_info = {"prompt_tokens": 0, "response_tokens": 0, "cost": 0}
@@ -472,4 +483,4 @@ def _infer(
         else:
             output = "This is a mock output."
usage_info = {"promt_tokens": 0, "response_tokens": 0, "cost": 0} - return output, usage_info + return output, usage_info \ No newline at end of file diff --git a/experiments/conf/config.yaml b/experiments/conf/config.yaml index cf01287..28949ad 100644 --- a/experiments/conf/config.yaml +++ b/experiments/conf/config.yaml @@ -4,7 +4,7 @@ defaults: - dataset: medqa # [options: usmle, medmcqa, mmlu, pubmedqa, medqa, ciar, cosmosqa, gpqa] - _self_ -max_eval_count: None +max_eval_count: 100 num_eval_workers: 1 # Each worker receives a full batch of questions. eval_batch_size: 10 # Defaults to batch_size=1. verbose: False diff --git a/experiments/conf/system/chateval.yaml b/experiments/conf/system/chateval.yaml index ad535d2..244f22d 100644 --- a/experiments/conf/system/chateval.yaml +++ b/experiments/conf/system/chateval.yaml @@ -1,5 +1,6 @@ defaults: - gpt + - palm - debate_prompts: chateval_ma_debate _target_: debatellm.systems.ChatEvalDebate diff --git a/experiments/conf/system/google_mad.yaml b/experiments/conf/system/google_mad.yaml index 261049a..119052d 100644 --- a/experiments/conf/system/google_mad.yaml +++ b/experiments/conf/system/google_mad.yaml @@ -1,5 +1,6 @@ defaults: - gpt + - palm - debate_prompts: google_ma_debate _target_: debatellm.systems.MultiAgentDebateGoogle diff --git a/experiments/conf/system/gpt.yaml b/experiments/conf/system/gpt.yaml index 0abdf92..138c72f 100644 --- a/experiments/conf/system/gpt.yaml +++ b/experiments/conf/system/gpt.yaml @@ -24,5 +24,7 @@ gpt: max_tokens: 1000 temperature: 0.5 # Taken from here: https://community.openai.com/t/cheat-sheet-mastering-temperature-and-top-p-in-chatgpt-api-a-few-tips-and-tricks-on-controlling-the-creativity-deterministic-output-of-prompt-responses/172683 top_p: 0.5 - cost_per_prompt_token: 0.001 # 0.03 # dollar costs per 1000 prompt token - cost_per_response_token: 0.002 # 0.06 # dollar costs per 1000 response token + # cost_per_prompt_token: 0.001 # 0.03 # dollar costs per 1000 prompt token + # cost_per_response_token: 0.002 # 0.06 # dollar costs per 1000 response token + cost_per_prompt_token: 0.0006 # 0.6 # dollar costs per million prompt token + cost_per_response_token: 0.0006 # 0.6 # dollar costs per million response token diff --git a/experiments/conf/system/medprompt.yaml b/experiments/conf/system/medprompt.yaml index 7087e65..db02d5b 100644 --- a/experiments/conf/system/medprompt.yaml +++ b/experiments/conf/system/medprompt.yaml @@ -1,5 +1,6 @@ defaults: - gpt + - palm - debate_prompts: medprompt _target_: debatellm.systems.Medprompt num_reasoning_steps: 5 diff --git a/experiments/conf/system/multi_agent_debate.yaml b/experiments/conf/system/multi_agent_debate.yaml index 96e1f24..005cff3 100644 --- a/experiments/conf/system/multi_agent_debate.yaml +++ b/experiments/conf/system/multi_agent_debate.yaml @@ -1,5 +1,6 @@ defaults: - gpt + - palm - debate_prompts: ma_debate _target_: debatellm.systems.MultiAgentDebate diff --git a/experiments/conf/system/single_agent.yaml b/experiments/conf/system/single_agent.yaml index 295fecb..f838704 100644 --- a/experiments/conf/system/single_agent.yaml +++ b/experiments/conf/system/single_agent.yaml @@ -15,6 +15,13 @@ agents: # options: [gpt, palm] # - prompt: "${system.agent_prompts.simple}" # PaLM agent - - - "${system.palm}" # palm uses default setup + # - - "${system.palm}" # palm uses default setup + # - prompt: "${system.agent_prompts.simple}" + # - engine: "text-bison@001" + + # Mixtral agent + - - "${system.gpt}" + - engine: "mixtral-8x7b-instruct" # mixtral 8x7b instruct 
     - prompt: "${system.agent_prompts.simple}"
-    - engine: "text-bison@001"
+    - cost_per_prompt_token: 0.0006 # 0.6 # dollar costs per million prompt token
+    - cost_per_response_token: 0.0006 # 0.6 # dollar costs per million response token
diff --git a/experiments/conf/system/tsinghua_mad.yaml b/experiments/conf/system/tsinghua_mad.yaml
index 4fafe93..b0b95c3 100644
--- a/experiments/conf/system/tsinghua_mad.yaml
+++ b/experiments/conf/system/tsinghua_mad.yaml
@@ -1,5 +1,6 @@
 defaults:
   - gpt
+  - palm
   - debate_prompts: tsinghua_ma_debate

 _target_: debatellm.systems.MultiAgentDebateTsinghua
diff --git a/experiments/evaluate.py b/experiments/evaluate.py
index 509fa57..adf0dce 100644
--- a/experiments/evaluate.py
+++ b/experiments/evaluate.py
@@ -63,7 +63,7 @@ def evaluate(cfg: omegaconf.DictConfig):
     eval_dataset = cfg.dataset.eval_dataset
     if os.getenv("TM_NEPTUNE_API_TOKEN"):
         logger = neptune.init_run(
-            project="InstaDeep/debatellm",
+            project="InstaDeep/truemed",
            api_token=os.getenv("TM_NEPTUNE_API_TOKEN"),
            tags=[f"{eval_dataset}", f"{system_name}{agents}"],
        )
diff --git a/manifest.yaml b/manifest.yaml
index 5f90f8b..3653a93 100644
--- a/manifest.yaml
+++ b/manifest.yaml
@@ -14,7 +14,7 @@ builder:
 spec:
   operator: tf
   image: debatellm
-  command: python scripts/launch_experiments.py
+  command: python experiments/evaluate.py

   tensorboard:
     enabled: false
diff --git a/requirements.txt b/requirements.txt
index 7ab05ac..f3cc73a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -161,7 +161,7 @@ nvidia-nccl-cu11==2.14.3
 nvidia-nvtx-cu11==11.7.91
 oauthlib==3.2.2
 omegaconf==2.3.0
-openai==0.27.8
+openai==1.21.2
 opt-einsum==3.3.0
 orjson==3.9.1
 overrides==7.3.1
diff --git a/scripts/full_results_table.ipynb b/scripts/full_results_table.ipynb
index b9bc36f..2681163 100644
--- a/scripts/full_results_table.ipynb
+++ b/scripts/full_results_table.ipynb
@@ -10,7 +10,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "https://app.neptune.ai/InstaDeep/debatellm/\n"
+      "https://app.neptune.ai/InstaDeep/truemed/\n"
      ]
     }
    ],
@@ -48,7 +48,7 @@
    "# Initialize Neptune\n",
    "API_TOKEN = os.environ[\"TM_NEPTUNE_API_TOKEN\"]\n",
    "project = neptune.init_project(\n",
-    "    project=\"InstaDeep/debatellm\",\n",
+    "    project=\"InstaDeep/truemed\",\n",
    "    mode=\"read-only\",\n",
    ")\n",
    "\n",
diff --git a/scripts/visualise_results.py b/scripts/visualise_results.py
index 07dbd0a..326b50f 100644
--- a/scripts/visualise_results.py
+++ b/scripts/visualise_results.py
@@ -51,7 +51,7 @@

 # Initialize Neptune
 API_TOKEN = os.environ["TM_NEPTUNE_API_TOKEN"]
 project = neptune.init_project(
-    project="InstaDeep/debatellm",
+    project="InstaDeep/truemed",
     mode="read-only",
 )
diff --git a/scripts/visualise_utils.py b/scripts/visualise_utils.py
index 79f7dc9..7c594ca 100644
--- a/scripts/visualise_utils.py
+++ b/scripts/visualise_utils.py
@@ -574,7 +574,7 @@ def get_dataset_runs(run_range: List[int], dataset: str = None, engine: str = No
     api_token = os.environ["TM_NEPTUNE_API_TOKEN"]
     project = neptune.init_project(
-        project="InstaDeep/debatellm",
+        project="InstaDeep/truemed",
         api_token=api_token,
         mode="read-only",
     )
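
For reviewers, a minimal standalone sketch of the openai>=1.x call pattern this patch migrates to, pointed at Perplexity's OpenAI-compatible endpoint. This is not part of the patch: the prompt, the key value, and the key-file path are illustrative placeholders, and the sampling values simply mirror gpt.yaml.

# Minimal sketch (assumption: openai>=1.0 installed, valid Perplexity key available).
import openai
from openai import OpenAI

client = OpenAI(
    api_key="<contents of pplx_api_key.txt>",  # placeholder key
    base_url="https://api.perplexity.ai",      # Perplexity's OpenAI-compatible endpoint
)

try:
    response = client.chat.completions.create(
        model="mixtral-8x7b-instruct",
        messages=[{"role": "user", "content": "Hello"}],  # placeholder prompt
        max_tokens=1000,
        temperature=0.5,
        top_p=0.5,
    )
    # v1.x responses are objects, not dicts: attribute access replaces key lookups.
    print(response.choices[0].message.content)
    print(response.usage.prompt_tokens, response.usage.completion_tokens)
except (openai.RateLimitError, openai.APIError) as err:
    # Top-level exception classes replace the old openai.error.* hierarchy.
    print("API error occurred:", err)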