Merge branch 'main' into kevin

SmartManoj · Oct 8, 2024 · bef8b7f · bef8b7f
2 parents 3101d0d + 9296ced
commit bef8b7f
Show file tree

Hide file tree

Showing 313 changed files with 22,766 additions and 6,980 deletions.
diff --git a/.github/workflows/ghcr-build.yml b/.github/workflows/ghcr-build.yml
@@ -294,7 +294,7 @@ jobs:
           SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
           TEST_IN_CI=true \
           RUN_AS_OPENHANDS=false \
-          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:
@@ -372,7 +372,7 @@ jobs:
           SANDBOX_RUNTIME_CONTAINER_IMAGE=$image_name \
           TEST_IN_CI=true \
           RUN_AS_OPENHANDS=true \
-          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=agenthub --cov=openhands --cov-report=xml -s ./tests/runtime
+          poetry run pytest -n 3 -raRs --reruns 2 --reruns-delay 5 --cov=openhands --cov-report=xml -s ./tests/runtime
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -46,4 +46,4 @@ jobs:
       - name: Install pre-commit
         run: pip install pre-commit==3.7.0
       - name: Run pre-commit hooks
-        run: pre-commit run --files openhands/**/* agenthub/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
+        run: pre-commit run --files openhands/**/* evaluation/**/* tests/**/* --show-diff-on-failure --config ./dev_config/python/.pre-commit-config.yaml
diff --git a/.github/workflows/py-unit-tests.yml b/.github/workflows/py-unit-tests.yml
@@ -93,7 +93,7 @@ jobs:
         id: buildx
         uses: docker/setup-buildx-action@v3
       - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml ./tests/unit --ignore=tests/unit/test_memory.py
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:
@@ -125,7 +125,7 @@ jobs:
       - name: Build Environment
         run: make build
       - name: Run Tests
-        run: poetry run pytest --forked --cov=agenthub --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
+        run: poetry run pytest --forked --cov=openhands --cov-report=xml -svv ./tests/unit --ignore=tests/unit/test_memory.py
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4
         env:

diff --git a/.gitignore b/.gitignore
@@ -121,6 +121,7 @@ celerybeat.pid
 
 # Environments
 .env
+frontend/.env
 .venv
 env/
 venv/

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -8,16 +8,17 @@ There are many ways that you can contribute:
 
 1. **Download and use** OpenHands, and send [issues](https://github.com/All-Hands-AI/OpenHands/issues) when you encounter something that isn't working or a feature that you'd like to see.
 2. **Send feedback** after each session by [clicking the thumbs-up thumbs-down buttons](https://docs.all-hands.dev/modules/usage/feedback), so we can see where things are working and failing, and also build an open dataset for training code agents.
-3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issue](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) issues that may be ones to start on.
+3. **Improve the Codebase** by sending PRs (see details below). In particular, we have some [good first issues](https://github.com/All-Hands-AI/OpenHands/labels/good%20first%20issue) that are good places to start.
 
 ## Understanding OpenHands's CodeBase
 
 To understand the codebase, please refer to the README in each module:
 - [frontend](./frontend/README.md)
-- [agenthub](./agenthub/README.md)
 - [evaluation](./evaluation/README.md)
 - [openhands](./openhands/README.md)
-    - [server](./openhands/server/README.md)
+   - [agenthub](./openhands/agenthub/README.md)
+   - [server](./openhands/server/README.md)
+
 
 When you write code, it is also good to write tests. Please navigate to the `tests` folder to see existing test suites.
 At the moment, we have two kinds of tests: `unit` and `integration`. Please refer to the README for each test suite. These tests also run on GitHub's continuous integration to ensure quality of the project.

diff --git a/containers/app/Dockerfile b/containers/app/Dockerfile
@@ -69,7 +69,7 @@ RUN playwright install --with-deps chromium
 
 COPY --chown=openhands:app --chmod=770 ./openhands ./openhands
 COPY --chown=openhands:app --chmod=777 ./openhands/runtime/plugins ./openhands/runtime/plugins
-COPY --chown=openhands:app --chmod=770 ./agenthub ./agenthub
+COPY --chown=openhands:app --chmod=770 ./openhands/agenthub ./openhands/agenthub
 COPY --chown=openhands:app ./pyproject.toml ./pyproject.toml
 COPY --chown=openhands:app ./poetry.lock ./poetry.lock
 COPY --chown=openhands:app ./README.md ./README.md
@@ -82,7 +82,7 @@ RUN python openhands/core/download.py # No-op to download assets
 # openhands:openhands -> openhands:app
 RUN find /app \! -group app -exec chgrp app {} +
 
-COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/dist ./frontend/dist
+COPY --chown=openhands:app --chmod=770 --from=frontend-builder /app/build/client ./frontend/build
 COPY --chown=openhands:app --chmod=770 ./containers/app/entrypoint.sh /app/entrypoint.sh
 
 USER root

diff --git a/dev_config/python/.pre-commit-config.yaml b/dev_config/python/.pre-commit-config.yaml
@@ -38,6 +38,6 @@ repos:
       - id: mypy
         additional_dependencies:
           [types-requests, types-setuptools, types-pyyaml, types-toml]
-        entry: mypy --config-file dev_config/python/mypy.ini openhands/ agenthub/
+        entry: mypy --config-file dev_config/python/mypy.ini openhands/
         always_run: true
         pass_filenames: false
diff --git a/docs/modules/usage/how-to/evaluation-harness.md b/docs/modules/usage/how-to/evaluation-harness.md
@@ -84,7 +84,7 @@ To create an evaluation workflow for your benchmark, follow these steps:
 
 1. Import relevant OpenHands utilities:
    ```python
-    import agenthub
+    import openhands.agenthub
     from evaluation.utils.shared import (
         EvalMetadata,
         EvalOutput,

diff --git a/evaluation/EDA/run_infer.py b/evaluation/EDA/run_infer.py
@@ -22,6 +22,7 @@
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
 
 game = None
 
@@ -122,7 +123,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 metadata.agent_class

diff --git a/evaluation/EDA/scripts/run_infer.sh b/evaluation/EDA/scripts/run_infer.sh
@@ -36,7 +36,7 @@ fi
 
 # IMPORTANT: Because Agent's prompt changes fairly often in the rapidly evolving codebase of OpenHands
 # We need to track the version of Agent in the evaluation to make sure results are comparable
-AGENT_VERSION=v$(poetry run python -c "import agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
+AGENT_VERSION=v$(poetry run python -c "import openhands.agenthub; from openhands.controller.agent import Agent; print(Agent.get_cls('$AGENT').VERSION)")
 
 echo "AGENT: $AGENT"
 echo "AGENT_VERSION: $AGENT_VERSION"

diff --git a/evaluation/agent_bench/run_infer.py b/evaluation/agent_bench/run_infer.py
@@ -217,7 +217,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
         )

diff --git a/evaluation/aider_bench/run_infer.py b/evaluation/aider_bench/run_infer.py
@@ -30,7 +30,7 @@
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime
 
@@ -211,7 +211,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
         )

diff --git a/evaluation/biocoder/run_infer.py b/evaluation/biocoder/run_infer.py
@@ -27,7 +27,7 @@
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime
 
@@ -285,7 +285,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 metadata.agent_class

diff --git a/evaluation/bird/run_infer.py b/evaluation/bird/run_infer.py
@@ -409,7 +409,7 @@ def execute_sql(db_path, sql):
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 metadata.agent_class
             ],

diff --git a/evaluation/browsing_delegation/run_infer.py b/evaluation/browsing_delegation/run_infer.py
@@ -23,6 +23,7 @@
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
 
 # Only CodeActAgent can delegate to BrowsingAgent
 SUPPORTED_AGENT_CLS = {'CodeActAgent'}
@@ -76,7 +77,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
         )
     )

diff --git a/evaluation/gaia/run_infer.py b/evaluation/gaia/run_infer.py
@@ -148,7 +148,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                 metadata.agent_class

diff --git a/evaluation/gorilla/run_infer.py b/evaluation/gorilla/run_infer.py
@@ -24,6 +24,7 @@
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
+from openhands.events.action import MessageAction
 
 AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
     'CodeActAgent': codeact_user_response,
@@ -83,7 +84,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                 metadata.agent_class

diff --git a/evaluation/gpqa/run_infer.py b/evaluation/gpqa/run_infer.py
@@ -219,7 +219,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                 metadata.agent_class

diff --git a/evaluation/humanevalfix/run_infer.py b/evaluation/humanevalfix/run_infer.py
@@ -35,7 +35,7 @@
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime
 
@@ -237,7 +237,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                 metadata.agent_class

diff --git a/evaluation/logic_reasoning/run_infer.py b/evaluation/logic_reasoning/run_infer.py
@@ -211,7 +211,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                 metadata.agent_class

diff --git a/evaluation/miniwob/run_infer.py b/evaluation/miniwob/run_infer.py
@@ -128,11 +128,12 @@ def process_instance(
 
     runtime = create_runtime(config, sid=env_id)
     task_str = initialize_runtime(runtime)
-
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=task_str,  # take output from initialize_runtime
+            initial_user_action=MessageAction(
+                content=task_str
+            ),  # take output from initialize_runtime
             runtime=runtime,
         )
     )

diff --git a/evaluation/mint/run_infer.py b/evaluation/mint/run_infer.py
@@ -29,6 +29,7 @@
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action import (
     CmdRunAction,
+    MessageAction,
 )
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime
@@ -180,7 +181,7 @@ def process_instance(
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=fake_user_response_fn,
         )

diff --git a/evaluation/ml_bench/run_infer.py b/evaluation/ml_bench/run_infer.py
@@ -39,7 +39,7 @@
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction
+from openhands.events.action import CmdRunAction, MessageAction
 from openhands.events.observation import CmdOutputObservation
 from openhands.runtime.runtime import Runtime
 
@@ -242,7 +242,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
     state: State | None = asyncio.run(
         run_controller(
             config=config,
-            task_str=instruction,
+            initial_user_action=MessageAction(content=instruction),
             runtime=runtime,
             fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN.get(
                 metadata.agent_class

diff --git a/evaluation/regression/conftest.py b/evaluation/regression/conftest.py
@@ -8,7 +8,7 @@
 
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 CASES_DIR = os.path.join(SCRIPT_DIR, 'cases')
-AGENTHUB_DIR = os.path.join(SCRIPT_DIR, '../../', 'agenthub')
+AGENTHUB_DIR = os.path.join(SCRIPT_DIR, '../../', 'openhands', 'agenthub')  # agenthub now lives under <repo root>/openhands/
 
 
 def agents():

diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py
@@ -8,7 +8,7 @@
 import toml
 from datasets import load_dataset
 
-import agenthub
+import openhands.agenthub
 from evaluation.swe_bench.prompt import CODEACT_SWE_PROMPT
 from evaluation.swe_bench.swe_bench2 import update_issue_description
 from evaluation.swe_bench.test_codes import get_test_code
@@ -31,7 +31,7 @@
 )
 from openhands.core.logger import openhands_logger as logger
 from openhands.core.main import create_runtime, run_controller
-from openhands.events.action import CmdRunAction, IPythonRunCellAction
+from openhands.events.action import CmdRunAction, IPythonRunCellAction, MessageAction
 from openhands.events.observation import CmdOutputObservation, ErrorObservation
 from openhands.events.serialization.event import event_to_dict
 from openhands.runtime.runtime import Runtime
@@ -399,6 +399,7 @@ def process_instance(
         logger.info(f'Starting evaluation for instance {instance.instance_id}.')
 
     runtime = create_runtime(config, sid=instance.instance_id)
+
     try:
         initialize_runtime(runtime, instance)
 
@@ -408,7 +409,7 @@ def process_instance(
         state: State | None = asyncio.run(
             run_controller(
                 config=config,
-                task_str=instruction,
+                initial_user_action=MessageAction(content=instruction),
                 runtime=runtime,
                 fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
                     metadata.agent_class
@@ -503,7 +504,7 @@ def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
         raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
 
     details = {}
-    _agent_cls = agenthub.Agent.get_cls(args.agent_cls)
+    _agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
     if hasattr(_agent_cls, 'system_message'):
         details['system_message'] = _agent_cls.system_message
     if hasattr(_agent_cls, 'in_context_example'):

diff --git a/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh b/evaluation/swe_bench/scripts/docker/pull_all_eval_docker.sh
@@ -22,11 +22,10 @@ if [ -z "$SET" ]; then
     SET="lite"
 fi
 
-NAMESPACE=$3 # xingyaoww
-if [ -z "$NAMESPACE" ]; then
-    echo "Default to namespace: xingyaoww"
-    NAMESPACE="xingyaoww"
-fi
+# Check if namespace is provided via argument $3, otherwise default to 'xingyaoww'
+NAMESPACE=${3:-xingyaoww}
+
+echo "Using namespace: $NAMESPACE"
 
 if [ "$SET" == "lite" ]; then
     IMAGE_FILE="$(dirname "$0")/all-swebench-lite-instance-images.txt"

diff --git a/evaluation/swe_bench/scripts/setup/instance_swe_entry.sh b/evaluation/swe_bench/scripts/setup/instance_swe_entry.sh
@@ -30,7 +30,7 @@ if [[ -z "$item" ]]; then
   exit 1
 fi
 
-WORKSPACE_NAME=$(echo "$item" | jq -r '.repo + "__" + .version | gsub("/"; "__")')
+WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')
 
 echo "WORKSPACE_NAME: $WORKSPACE_NAME"