Skip to content

Commit

Permalink
TEMPORARY swe-bench
Browse files Browse the repository at this point in the history
  • Loading branch information
enyst committed Jun 26, 2024
1 parent df00b88 commit 39afe88
Show file tree
Hide file tree
Showing 7 changed files with 135 additions and 22 deletions.
33 changes: 32 additions & 1 deletion evaluation/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@
from evaluation.swe_bench.swe_env_box import SWEBenchSSHBox
from opendevin.controller.state.state import State
from opendevin.core.config import args, config, get_llm_config_arg
from opendevin.core.logger import get_console_handler
from opendevin.core.logger import (
get_console_handler,
get_llm_prompt_file_handler,
get_llm_response_file_handler,
llm_prompt_logger,
llm_response_logger,
)
from opendevin.core.logger import opendevin_logger as logger
from opendevin.core.main import main
from opendevin.events.action import MessageAction
Expand Down Expand Up @@ -226,6 +232,31 @@ def process_instance(
logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
)
logger.addHandler(file_handler)

# prompt logger
directory = os.path.join(eval_output_dir, 'infer_logs')
sid = f'inst_{instance.instance_id}'
for handler in llm_prompt_logger.handlers[:]:
llm_prompt_logger.removeHandler(handler)
prompt_file_handler = get_llm_prompt_file_handler(
sid=sid, with_date=False, directory=directory
)
llm_prompt_logger.addHandler(prompt_file_handler)
llm_prompt_logger.setLevel(logging.DEBUG)
llm_prompt_logger.propagate = False
prompt_file_handler.setFormatter(logging.Formatter('%(message)s'))

# response logger
for handler in llm_response_logger.handlers[:]:
llm_response_logger.removeHandler(handler)
response_file_handler = get_llm_response_file_handler(
sid=sid, with_date=False, directory=directory
)
llm_response_logger.addHandler(response_file_handler)
llm_response_logger.setLevel(logging.DEBUG)
llm_response_logger.propagate = False
response_file_handler.setFormatter(logging.Formatter('%(message)s'))

else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')

Expand Down
2 changes: 1 addition & 1 deletion evaluation/swe_bench/scripts/run_infer.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"

# Set a default for USE_HINT_TEXT when the caller has not provided one
if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=true
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$AGENT_VERSION"
Expand Down
56 changes: 55 additions & 1 deletion evaluation/swe_bench/swe_env_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ def get_box_for_instance(
try:
config.workspace_base = workspace_mount_path
config.workspace_mount_path = workspace_mount_path
logger.warning(
f"{instance['instance_id']} : setting workspace_base and workspace_mount_path to {workspace_mount_path}"
)

# linting python after editing helps LLM fix indentations
config.enable_auto_lint = True
Expand Down Expand Up @@ -153,14 +156,24 @@ def get_diff_patch(self):
return git_patch


def print_env_vars(sandbox):
    """Log the sandbox's values for a fixed set of shell variables.

    For each of REPO_PATH, SWE_TASK_DIR and TEST_CMD, runs ``echo $NAME``
    through ``sandbox.execute`` and logs the whitespace-stripped output at
    INFO level. The exit code returned by ``execute`` is not inspected.
    """
    for name in ['REPO_PATH', 'SWE_TASK_DIR', 'TEST_CMD']:
        echo_command = f'echo ${name}'
        # execute() yields an (exit_code, output) pair; only the output is used.
        _, raw_value = sandbox.execute(echo_command)
        logger.info(f'{name}: {raw_value.strip()}')


if __name__ == '__main__':
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenDevin's repo
dataset = load_dataset('princeton-nlp/SWE-bench_Lite')
swe_bench_tests = dataset['test'].to_pandas()

# INSTANCE_ID = 'django__django-11099'
INSTANCE_ID = 'astropy__astropy-12907'
# INSTANCE_ID = 'astropy__astropy-12907'
# failures:
# INSTANCE_ID = 'psf__requests-2317'
INSTANCE_ID = 'scikit-learn__scikit-learn-13142'
swe_bench_tests = swe_bench_tests[swe_bench_tests['instance_id'] == INSTANCE_ID]
EXAMPLE_INSTANCE = swe_bench_tests.iloc[0].to_dict()

Expand All @@ -174,6 +187,47 @@ def get_diff_patch(self):
assert exit_code == 0, 'Failed to cd $REPO_PATH'
logger.info(f'cd $REPO_PATH: {output}')

print_env_vars(sandbox)

# Reset the repo
exit_code, output = sandbox.execute('git reset --hard')
assert exit_code == 0, 'Failed to reset the repo'
logger.info(f'git reset --hard: {output}')

exit_code, output = sandbox.execute('cat $SWE_TASK_DIR/test.patch')
logger.info(f'Content of test.patch:\n{output}')

exit_code, output = sandbox.execute('ls -l $SWE_TASK_DIR/test.patch')
logger.info(f'File permissions of test.patch: {output}')

exit_code, output = sandbox.execute('ls -la $REPO_PATH')
logger.info(f'Repository file permissions:\n{output}')

# exit_code, output = sandbox.execute('ls -la $REPO_PATH/.git')
# logger.info(f'Git directory permissions:\n{output}')

# exit_code, output = sandbox.execute('git --version && git config --list')
# logger.info(f'Git version and config:\n{output}')

# exit_code, output = sandbox.execute('patch -p1 < $SWE_TASK_DIR/test.patch')
# logger.info(f'Manual patch application:\n{output}')

exit_code, output = sandbox.execute(
'git apply --verbose $SWE_TASK_DIR/test.patch test_requests.py'
)
logger.info(f'Applying patch to specific file:\n{output}')

exit_code, output = sandbox.execute('git status')
logger.info(f'Git status before patch:\n{output}')

# exit_code, output = sandbox.execute('patch -p1 < $SWE_TASK_DIR/test.patch')
# logger.info(f'Manual patch application:\n{output}')

# Reset the repo
exit_code, output = sandbox.execute('git reset --hard')
assert exit_code == 0, 'Failed to reset the repo'
logger.info(f'git reset --hard: {output}')

# apply test patch
exit_code, output = sandbox.execute('git apply $SWE_TASK_DIR/test.patch')
assert exit_code == 0, 'Failed to apply test patch'
Expand Down
44 changes: 34 additions & 10 deletions opendevin/core/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,15 @@ class LlmFileHandler(logging.FileHandler):
# LLM prompt and response logging
"""

def __init__(self, filename, mode='a', encoding='utf-8', delay=False):
def __init__(
self,
filename,
mode='a',
encoding='utf-8',
with_date: bool = False,
delay=False,
directory: str | None = None,
):
"""
Initializes an instance of LlmFileHandler.
Expand All @@ -196,11 +204,15 @@ def __init__(self, filename, mode='a', encoding='utf-8', delay=False):
"""
self.filename = filename
self.message_counter = 1
if config.debug:
if config.debug and with_date:
self.session = datetime.now().strftime('%y-%m-%d_%H-%M')
else:
self.session = 'default'
self.log_directory = os.path.join(os.getcwd(), 'logs', 'llm', self.session)
self.session = ''
self.log_directory = (
os.path.join(os.getcwd(), 'logs', 'llm', self.session)
if directory is None
else directory
)
os.makedirs(self.log_directory, exist_ok=True)
if not config.debug:
# Clear the log directory if not in debug mode
Expand Down Expand Up @@ -232,21 +244,33 @@ def emit(self, record):
self.message_counter += 1


def get_llm_prompt_file_handler():
def get_llm_prompt_file_handler(
sid: str = '', with_date: bool = False, directory: str | None = None
):
"""
Returns a file handler for LLM prompt logging.
"""
llm_prompt_file_handler = LlmFileHandler('prompt', delay=True)
filename = f'prompt_{sid}' if sid else 'prompt'

llm_prompt_file_handler = LlmFileHandler(
filename=filename, with_date=with_date, delay=True, directory=directory
)
llm_prompt_file_handler.setFormatter(llm_formatter)
llm_prompt_file_handler.setLevel(logging.DEBUG)
return llm_prompt_file_handler


def get_llm_response_file_handler():
def get_llm_response_file_handler(
sid: str = '', with_date: bool = False, directory: str | None = None
):
"""
Returns a file handler for LLM response logging.
"""
llm_response_file_handler = LlmFileHandler('response', delay=True)
filename = f'response_{sid}' if sid else 'response'

llm_response_file_handler = LlmFileHandler(
filename=filename, with_date=with_date, delay=True, directory=directory
)
llm_response_file_handler.setFormatter(llm_formatter)
llm_response_file_handler.setLevel(logging.DEBUG)
return llm_response_file_handler
Expand All @@ -255,9 +279,9 @@ def get_llm_response_file_handler():
llm_prompt_logger = logging.getLogger('prompt')
llm_prompt_logger.propagate = False
llm_prompt_logger.setLevel(logging.DEBUG)
llm_prompt_logger.addHandler(get_llm_prompt_file_handler())
llm_prompt_logger.addHandler(get_llm_prompt_file_handler(with_date=False))

llm_response_logger = logging.getLogger('response')
llm_response_logger.propagate = False
llm_response_logger.setLevel(logging.DEBUG)
llm_response_logger.addHandler(get_llm_response_file_handler())
llm_response_logger.addHandler(get_llm_response_file_handler(with_date=False))
2 changes: 1 addition & 1 deletion opendevin/runtime/docker/image_agnostic_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def generate_dockerfile_content(base_image: str) -> str:
# FIXME: Remove the requirement of ssh in a future version
dockerfile_content = (
f'FROM {base_image}\n'
'RUN apt update && apt install -y openssh-server wget sudo\n'
'RUN apt update && apt install -y openssh-server wget sudo net-tools iproute2\n'
'RUN mkdir -p -m0755 /var/run/sshd\n'
'RUN mkdir -p /opendevin && mkdir -p /opendevin/logs && chmod 777 /opendevin/logs\n'
'RUN wget "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"\n'
Expand Down
18 changes: 11 additions & 7 deletions opendevin/runtime/docker/ssh_box.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,14 +348,18 @@ def setup_user(self):
)
# check that the miniforge3 directory exists
exit_code, logs = self.container.exec_run(
['/bin/bash', '-c', '[ -d "/opendevin/miniforge3" ] && exit 0 || exit 1'],
[
'/bin/bash',
'-c',
'[ -d "/opendevin/miniforge3" ] && exit 0 || exit 1',
],
workdir=self.sandbox_workspace_dir,
environment=self._env,
)
if exit_code != 0:
if exit_code == 1:
raise Exception(
f'OPENDEVIN_PYTHON_INTERPRETER is not usable. Please pull the latest Docker image: docker pull ghcr.io/opendevin/sandbox:main'
'OPENDEVIN_PYTHON_INTERPRETER is not usable. Please pull the latest Docker image: docker pull ghcr.io/opendevin/sandbox:main'
)
else:
raise Exception(
Expand Down Expand Up @@ -487,17 +491,17 @@ def execute(

# once out, make sure that we have *every* output: keep looping until we get an empty output
while True:
logger.debug('WAITING FOR .prompt()')
# logger.debug('WAITING FOR .prompt()')
self.ssh.sendline('\n')
timeout_not_reached = self.ssh.prompt(timeout=1)
if not timeout_not_reached:
logger.debug('TIMEOUT REACHED')
break
logger.debug('WAITING FOR .before')
# logger.debug('WAITING FOR .before')
output = self.ssh.before
logger.debug(
f'WAITING FOR END OF command output ({bool(output)}): {output}'
)
# logger.debug(
# f'WAITING FOR END OF command output ({bool(output)}): {output}'
# )
if isinstance(output, str) and output.strip() == '':
break
command_output += output
Expand Down
2 changes: 1 addition & 1 deletion opendevin/runtime/plugins/jupyter/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ find_free_port() {
local end_port="${2:-65535}"

for port in $(seq $start_port $end_port); do
if ! ss -tuln | awk '{print $5}' | grep -q ":$port$"; then
if ! netstat -tuln | awk '{print $4}' | grep -q ":$port$"; then
echo $port
return
fi
Expand Down

0 comments on commit 39afe88

Please sign in to comment.