From 39afe88a46881ebba5b100b441f8c4d93cdfa157 Mon Sep 17 00:00:00 2001 From: Engel Nyst Date: Sat, 22 Jun 2024 23:41:52 +0200 Subject: [PATCH] TEMPORARY swe-bench --- evaluation/swe_bench/run_infer.py | 33 ++++++++++- evaluation/swe_bench/scripts/run_infer.sh | 2 +- evaluation/swe_bench/swe_env_box.py | 56 ++++++++++++++++++- opendevin/core/logger.py | 44 +++++++++++---- .../runtime/docker/image_agnostic_util.py | 2 +- opendevin/runtime/docker/ssh_box.py | 18 +++--- opendevin/runtime/plugins/jupyter/setup.sh | 2 +- 7 files changed, 135 insertions(+), 22 deletions(-) mode change 100644 => 100755 evaluation/swe_bench/scripts/run_infer.sh diff --git a/evaluation/swe_bench/run_infer.py b/evaluation/swe_bench/run_infer.py index 386353c858..90d0384df5 100644 --- a/evaluation/swe_bench/run_infer.py +++ b/evaluation/swe_bench/run_infer.py @@ -18,7 +18,13 @@ from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox from opendevin.controller.state.state import State from opendevin.core.config import args, config, get_llm_config_arg -from opendevin.core.logger import get_console_handler +from opendevin.core.logger import ( + get_console_handler, + get_llm_prompt_file_handler, + get_llm_response_file_handler, + llm_prompt_logger, + llm_response_logger, +) from opendevin.core.logger import opendevin_logger as logger from opendevin.core.main import main from opendevin.events.action import MessageAction @@ -226,6 +232,31 @@ def process_instance( logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') ) logger.addHandler(file_handler) + + # prompt logger + directory = os.path.join(eval_output_dir, 'infer_logs') + sid = f'inst_{instance.instance_id}' + for handler in llm_prompt_logger.handlers[:]: + llm_prompt_logger.removeHandler(handler) + prompt_file_handler = get_llm_prompt_file_handler( + sid=sid, with_date=False, directory=directory + ) + llm_prompt_logger.addHandler(prompt_file_handler) + llm_prompt_logger.setLevel(logging.DEBUG) + llm_prompt_logger.propagate = False + prompt_file_handler.setFormatter(logging.Formatter('%(message)s')) + + # response logger + for handler in llm_response_logger.handlers[:]: + llm_response_logger.removeHandler(handler) + response_file_handler = get_llm_response_file_handler( + sid=sid, with_date=False, directory=directory + ) + llm_response_logger.addHandler(response_file_handler) + llm_response_logger.setLevel(logging.DEBUG) + llm_response_logger.propagate = False + response_file_handler.setFormatter(logging.Formatter('%(message)s')) + else: logger.info(f'Starting evaluation for instance {instance.instance_id}.') diff --git a/evaluation/swe_bench/scripts/run_infer.sh b/evaluation/swe_bench/scripts/run_infer.sh old mode 100644 new mode 100755 index 17fde504a4..6c72db0a3b --- a/evaluation/swe_bench/scripts/run_infer.sh +++ b/evaluation/swe_bench/scripts/run_infer.sh @@ -34,7 +34,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG" # Default to use Hint if [ -z "$USE_HINT_TEXT" ]; then - export USE_HINT_TEXT=true + export USE_HINT_TEXT=false fi echo "USE_HINT_TEXT: $USE_HINT_TEXT" EVAL_NOTE="$AGENT_VERSION" diff --git a/evaluation/swe_bench/swe_env_box.py b/evaluation/swe_bench/swe_env_box.py index 051c06d8fb..5e94da07cb 100644 --- a/evaluation/swe_bench/swe_env_box.py +++ b/evaluation/swe_bench/swe_env_box.py @@ -89,6 +89,9 @@ def get_box_for_instance( try: config.workspace_base = workspace_mount_path config.workspace_mount_path = workspace_mount_path + logger.warning( + f"{instance['instance_id']} : setting workspace_base and workspace_mount_path to {workspace_mount_path}" + ) # linting python after editing helps LLM fix indentations config.enable_auto_lint = True @@ -153,6 +156,13 @@ def get_diff_patch(self): return git_patch +def print_env_vars(sandbox): + env_vars = ['REPO_PATH', 'SWE_TASK_DIR', 'TEST_CMD'] + for var in env_vars: + exit_code, output = sandbox.execute(f'echo ${var}') + logger.info(f'{var}: {output.strip()}') + + if __name__ == '__main__': # NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing # so we don't need to manage file uploading to OpenDevin's repo @@ -160,7 +170,10 @@ def get_diff_patch(self): swe_bench_tests = dataset['test'].to_pandas() # INSTANCE_ID = 'django__django-11099' - INSTANCE_ID = 'astropy__astropy-12907' + # INSTANCE_ID = 'astropy__astropy-12907' + # failures: + # INSTANCE_ID = 'psf__requests-2317' + INSTANCE_ID = 'scikit-learn__scikit-learn-13142' swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'] == INSTANCE_ID] EXAMPLE_INSTANCE = swe_bench_tests.iloc[0].to_dict() @@ -174,6 +187,47 @@ def get_diff_patch(self): assert exit_code == 0, 'Failed to cd $REPO_PATH' logger.info(f'cd $REPO_PATH: {output}') + print_env_vars(sandbox) + + # Reset the repo + exit_code, output = sandbox.execute('git reset --hard') + assert exit_code == 0, 'Failed to reset the repo' + logger.info(f'git reset --hard: {output}') + + exit_code, output = sandbox.execute('cat $SWE_TASK_DIR/test.patch') + logger.info(f'Content of test.patch:\n{output}') + + exit_code, output = sandbox.execute('ls -l $SWE_TASK_DIR/test.patch') + logger.info(f'File permissions of test.patch: {output}') + + exit_code, output = sandbox.execute('ls -la $REPO_PATH') + logger.info(f'Repository file permissions:\n{output}') + + # exit_code, output = sandbox.execute('ls -la $REPO_PATH/.git') + # logger.info(f'Git directory permissions:\n{output}') + + # exit_code, output = sandbox.execute('git --version && git config --list') + # logger.info(f'Git version and config:\n{output}') + + # exit_code, output = sandbox.execute('patch -p1 < $SWE_TASK_DIR/test.patch') + # logger.info(f'Manual patch application:\n{output}') + + exit_code, output = sandbox.execute( + 'git apply --verbose $SWE_TASK_DIR/test.patch test_requests.py' + ) + logger.info(f'Applying patch to specific file:\n{output}') + + exit_code, output = sandbox.execute('git status') + logger.info(f'Git status before patch:\n{output}') + + # exit_code, output = sandbox.execute('patch -p1 < $SWE_TASK_DIR/test.patch') + # logger.info(f'Manual patch application:\n{output}') + + # Reset the repo + exit_code, output = sandbox.execute('git reset --hard') + assert exit_code == 0, 'Failed to reset the repo' + logger.info(f'git reset --hard: {output}') + # apply test patch exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch') assert exit_code == 0, 'Failed to apply test patch' diff --git a/opendevin/core/logger.py b/opendevin/core/logger.py index 737c762508..52e3204d5a 100644 --- a/opendevin/core/logger.py +++ b/opendevin/core/logger.py @@ -184,7 +184,15 @@ class LlmFileHandler(logging.FileHandler): # LLM prompt and response logging """ - def __init__(self, filename, mode='a', encoding='utf-8', delay=False): + def __init__( + self, + filename, + mode='a', + encoding='utf-8', + with_date: bool = False, + delay=False, + directory: str | None = None, + ): """ Initializes an instance of LlmFileHandler. @@ -196,11 +204,15 @@ def __init__(self, filename, mode='a', encoding='utf-8', delay=False): """ self.filename = filename self.message_counter = 1 - if config.debug: + if config.debug and with_date: self.session = datetime.now().strftime('%y-%m-%d_%H-%M') else: - self.session = 'default' - self.log_directory = os.path.join(os.getcwd(), 'logs', 'llm', self.session) + self.session = '' + self.log_directory = ( + os.path.join(os.getcwd(), 'logs', 'llm', self.session) + if directory is None + else directory + ) os.makedirs(self.log_directory, exist_ok=True) if not config.debug: # Clear the log directory if not in debug mode @@ -232,21 +244,33 @@ def emit(self, record): self.message_counter += 1 -def get_llm_prompt_file_handler(): +def get_llm_prompt_file_handler( + sid: str = '', with_date: bool = False, directory: str | None = None +): """ Returns a file handler for LLM prompt logging. """ - llm_prompt_file_handler = LlmFileHandler('prompt', delay=True) + filename = f'prompt_{sid}' if sid else 'prompt' + + llm_prompt_file_handler = LlmFileHandler( + filename=filename, with_date=with_date, delay=True, directory=directory + ) llm_prompt_file_handler.setFormatter(llm_formatter) llm_prompt_file_handler.setLevel(logging.DEBUG) return llm_prompt_file_handler -def get_llm_response_file_handler(): +def get_llm_response_file_handler( + sid: str = '', with_date: bool = False, directory: str | None = None +): """ Returns a file handler for LLM response logging. """ - llm_response_file_handler = LlmFileHandler('response', delay=True) + filename = f'response_{sid}' if sid else 'response' + + llm_response_file_handler = LlmFileHandler( + filename=filename, with_date=with_date, delay=True, directory=directory + ) llm_response_file_handler.setFormatter(llm_formatter) llm_response_file_handler.setLevel(logging.DEBUG) return llm_response_file_handler @@ -255,9 +279,9 @@ def get_llm_response_file_handler(): llm_prompt_logger = logging.getLogger('prompt') llm_prompt_logger.propagate = False llm_prompt_logger.setLevel(logging.DEBUG) -llm_prompt_logger.addHandler(get_llm_prompt_file_handler()) +llm_prompt_logger.addHandler(get_llm_prompt_file_handler(with_date=False)) llm_response_logger = logging.getLogger('response') llm_response_logger.propagate = False llm_response_logger.setLevel(logging.DEBUG) -llm_response_logger.addHandler(get_llm_response_file_handler()) +llm_response_logger.addHandler(get_llm_response_file_handler(with_date=False)) diff --git a/opendevin/runtime/docker/image_agnostic_util.py b/opendevin/runtime/docker/image_agnostic_util.py index 7d6879d9dc..45dba43df6 100644 --- a/opendevin/runtime/docker/image_agnostic_util.py +++ b/opendevin/runtime/docker/image_agnostic_util.py @@ -14,7 +14,7 @@ def generate_dockerfile_content(base_image: str) -> str: # FIXME: Remove the requirement of ssh in future version dockerfile_content = ( f'FROM {base_image}\n' - 'RUN apt update && apt install -y openssh-server wget sudo\n' + 'RUN apt update && apt install -y openssh-server wget sudo net-tools iproute2\n' 'RUN mkdir -p -m0755 /var/run/sshd\n' 'RUN mkdir -p /opendevin && mkdir -p /opendevin/logs && chmod 777 /opendevin/logs\n' 'RUN wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"\n' diff --git a/opendevin/runtime/docker/ssh_box.py b/opendevin/runtime/docker/ssh_box.py index 510a87615b..7862ca49bb 100644 --- a/opendevin/runtime/docker/ssh_box.py +++ b/opendevin/runtime/docker/ssh_box.py @@ -348,14 +348,18 @@ def setup_user(self): ) # check the miniforge3 directory exist exit_code, logs = self.container.exec_run( - ['/bin/bash', '-c', '[ -d "/opendevin/miniforge3" ] && exit 0 || exit 1'], + [ + '/bin/bash', + '-c', + '[ -d "/opendevin/miniforge3" ] && exit 0 || exit 1', + ], workdir=self.sandbox_workspace_dir, environment=self._env, ) if exit_code != 0: if exit_code == 1: raise Exception( - f'OPENDEVIN_PYTHON_INTERPRETER is not usable. Please pull the latest Docker image: docker pull ghcr.io/opendevin/sandbox:main' + 'OPENDEVIN_PYTHON_INTERPRETER is not usable. Please pull the latest Docker image: docker pull ghcr.io/opendevin/sandbox:main' ) else: raise Exception( @@ -487,17 +491,17 @@ def execute( # once out, make sure that we have *every* output, we while loop until we get an empty output while True: - logger.debug('WAITING FOR .prompt()') + # logger.debug('WAITING FOR .prompt()') self.ssh.sendline('\n') timeout_not_reached = self.ssh.prompt(timeout=1) if not timeout_not_reached: logger.debug('TIMEOUT REACHED') break - logger.debug('WAITING FOR .before') + # logger.debug('WAITING FOR .before') output = self.ssh.before - logger.debug( - f'WAITING FOR END OF command output ({bool(output)}): {output}' - ) + # logger.debug( + # f'WAITING FOR END OF command output ({bool(output)}): {output}' + # ) if isinstance(output, str) and output.strip() == '': break command_output += output diff --git a/opendevin/runtime/plugins/jupyter/setup.sh b/opendevin/runtime/plugins/jupyter/setup.sh index e54649303e..5e8d06b682 100755 --- a/opendevin/runtime/plugins/jupyter/setup.sh +++ b/opendevin/runtime/plugins/jupyter/setup.sh @@ -41,7 +41,7 @@ find_free_port() { local end_port="${2:-65535}" for port in $(seq $start_port $end_port); do - if ! ss -tuln | awk '{print $5}' | grep -q ":$port$"; then + if ! netstat -tuln | awk '{print $4}' | grep -q ":$port$"; then echo $port return fi