From 1109637efbe72b524473f0a4a4b84bbc74db3764 Mon Sep 17 00:00:00 2001 From: Xingyao Wang Date: Mon, 30 Sep 2024 18:48:38 -0500 Subject: [PATCH] Update instruction for new version of eval runtime-api (#4128) --- evaluation/swe_bench/README.md | 5 +++-- evaluation/swe_bench/eval_infer.py | 1 + evaluation/swe_bench/run_infer.py | 1 + evaluation/swe_bench/scripts/cleanup_remote_runtime.sh | 6 +++--- 4 files changed, 8 insertions(+), 5 deletions(-) mode change 100644 => 100755 evaluation/swe_bench/scripts/cleanup_remote_runtime.sh diff --git a/evaluation/swe_bench/README.md b/evaluation/swe_bench/README.md index 40e3340c6a..80f150edb0 100644 --- a/evaluation/swe_bench/README.md +++ b/evaluation/swe_bench/README.md @@ -69,7 +69,7 @@ This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash # ./evaluation/swe_bench/scripts/run_infer.sh [model_config] [git-version] [agent] [eval_limit] [max_iter] [num_workers] [dataset] [dataset_split] -ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" \ +ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ ./evaluation/swe_bench/scripts/run_infer.sh llm.eval HEAD CodeActAgent 300 30 16 "princeton-nlp/SWE-bench_Lite" test # This example runs evaluation on CodeActAgent for 300 instances on "princeton-nlp/SWE-bench_Lite"'s test set, with max 30 iteration per instances, with 16 number of workers running in parallel ``` @@ -163,7 +163,8 @@ This is in limited beta. Contact Xingyao over slack if you want to try this out! ```bash # ./evaluation/swe_bench/scripts/eval_infer_remote.sh [output.jsonl filepath] [num_workers] -ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote EVAL_DOCKER_IMAGE_PREFIX="us-docker.pkg.dev/evaluation-428620/swe-bench-images" evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test" +ALLHANDS_API_KEY="YOUR-API-KEY" RUNTIME=remote SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev" EVAL_DOCKER_IMAGE_PREFIX="us-central1-docker.pkg.dev/evaluation-092424/swe-bench-images" \ +evaluation/swe_bench/scripts/eval_infer_remote.sh evaluation/evaluation_outputs/outputs/swe_bench_lite/CodeActAgent/Llama-3.1-70B-Instruct-Turbo_maxiter_30_N_v1.9-no-hint/output.jsonl 16 "princeton-nlp/SWE-bench_Lite" "test" # This example evaluate patches generated by CodeActAgent on Llama-3.1-70B-Instruct-Turbo on "princeton-nlp/SWE-bench_Lite"'s test set, with 16 number of workers running in parallel ``` diff --git a/evaluation/swe_bench/eval_infer.py b/evaluation/swe_bench/eval_infer.py index 8372c30ca0..525bc17e97 100644 --- a/evaluation/swe_bench/eval_infer.py +++ b/evaluation/swe_bench/eval_infer.py @@ -81,6 +81,7 @@ def get_config(instance: pd.Series) -> AppConfig: # large enough timeout, since some testcases take very long to run timeout=1800, api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), ), # do not mount workspace workspace_base=None, diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 09fe720f83..62333662c5 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -131,6 +131,7 @@ def get_config( # large enough timeout, since some testcases take very long to run timeout=300, api_key=os.environ.get('ALLHANDS_API_KEY', None), + remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'), ), # do not mount workspace workspace_base=None, diff --git a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh old mode 100644 new mode 100755 index d061e0d73c..77b1b7bdeb --- a/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh +++ b/evaluation/swe_bench/scripts/cleanup_remote_runtime.sh @@ -2,10 +2,10 @@ # API base URL -BASE_URL="https://api.all-hands.dev/v0" +BASE_URL="https://runtime.eval.all-hands.dev" # Get the list of runtimes -response=$(curl --silent --location --request GET "${BASE_URL}/runtime/list" \ +response=$(curl --silent --location --request GET "${BASE_URL}/list" \ --header "X-API-Key: ${ALLHANDS_API_KEY}") n_runtimes=$(echo $response | jq -r '.total') @@ -16,7 +16,7 @@ runtime_ids=$(echo $response | jq -r '.runtimes | .[].runtime_id') counter=1 for runtime_id in $runtime_ids; do echo "Stopping runtime ${counter}/${n_runtimes}: ${runtime_id}" - curl --silent --location --request POST "${BASE_URL}/runtime/stop" \ + curl --silent --location --request POST "${BASE_URL}/stop" \ --header "X-API-Key: ${ALLHANDS_API_KEY}" \ --header "Content-Type: application/json" \ --data-raw "{\"runtime_id\": \"${runtime_id}\"}"