From 1109637efbe72b524473f0a4a4b84bbc74db3764 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Mon, 30 Sep 2024 18:48:38 -0500
Subject: [PATCH] Update instruction for new version of eval runtime-api
 (#4128)

---
 evaluation/swe_bench/README.md                         | 5 +++--
 evaluation/swe_bench/eval_infer.py                     | 1 +
 evaluation/swe_bench/run_infer.py                      | 1 +
 evaluation/swe_bench/scripts/cleanup_remote_runtime.sh | 6 +++---
 4 files changed, 8 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 evaluation/swe_bench/scripts/cleanup_remote_runtime.sh

diff --git a/evaluation/swe_bench/README.md b/evaluation/swe_bench/README.md
index 40e3340c6a..80f150edb0 100644
--- a/evaluation/swe_bench/README.md
+++ b/evaluation/swe_bench/README.md
@@ -69,7 +69,7 @@ This is in limited beta. Contact Xingyao over slack if you want to try this out!
 
 ```bash
 # ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
 ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test
 # This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel
 ```
@@ -163,7 +163,8 @@ This is in limited beta. Contact Xingyao over slack if you want to try this out!
 
 ```bash
 # ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers]
-ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
+ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \
+evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test"
 # This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel
 ```
 
diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py
index 8372c30ca0..525bc17e97 100644
--- a/evaluation/swe_bench/eval_infer.py
+++ b/evaluation/swe_bench/eval_infer.py
@@ -81,6 +81,7 @@ def get_config(instance: pd.Series) -> AppConfig:
             # large enough timeout, since some testcases take very long to run
             timeout=1800,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
index 09fe720f83..62333662c5 100644
--- a/evaluation/swe_bench/run_infer.py
+++ b/evaluation/swe_bench/run_infer.py
@@ -131,6 +131,7 @@ def get_config(
             # large enough timeout, since some testcases take very long to run
             timeout=300,
             api_key=os.environ.get('ALLHANDS_API_KEY', None),
+            remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
         ),
         # do not mount workspace
         workspace_base=None,
diff --git a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
old mode 100644
new mode 100755
index d061e0d73c..77b1b7bdeb
--- a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
+++ b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh
@@ -2,10 +2,10 @@
 
 
 # API base URL
-BASE_URL="https://api.all-hands.dev/v0"
+BASE_URL="https://runtime.eval.all-hands.dev"
 
 # Get the list of runtimes
-response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \
+response=$(curl --silent --location --request GET "${BASE_URL}/list" \
   --header "X-API-Key: ${ALLHANDS_API_KEY}")
 
 n_runtimes=$(echo $response | jq -r '.total')
@@ -16,7 +16,7 @@ runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id')
 counter=1
 for runtime_id in $runtime_ids; do
   echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}"
-  curl --silent --location --request POST "${BASE_URL}/runtime/stop" \
+  curl --silent --location --request POST "${BASE_URL}/stop" \
     --header "X-API-Key: ${ALLHANDS_API_KEY}" \
     --header "Content-Type: application/json" \
     --data-raw "{\"runtime_id\": \"${runtime_id}\"}"