Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor history/event stream #3808

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions evaluation/EDA/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand All @@ -33,7 +34,7 @@ def codeact_user_response_eda(state: State) -> str:

# retrieve the latest model message from history
if state.history:
model_guess = state.history.get_last_agent_message()
model_guess = state.get_last_agent_message()

assert game is not None, 'Game is not initialized.'
msg = game.generate_user_response(model_guess)
Expand Down Expand Up @@ -137,7 +138,7 @@ def process_instance(
if state is None:
raise ValueError('State should not be None.')

final_message = state.history.get_last_agent_message()
final_message = state.get_last_agent_message()

logger.info(f'Final message: {final_message} | Ground truth: {instance["text"]}')
test_result = game.reward()
Expand All @@ -146,7 +147,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
5 changes: 3 additions & 2 deletions evaluation/agent_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -240,7 +241,7 @@ def process_instance(
raw_ans = ''

# retrieve the last agent message or thought
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
raw_ans = event.thought
Expand Down Expand Up @@ -269,7 +270,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

metrics = state.metrics.get() if state.metrics else None

Expand Down
3 changes: 2 additions & 1 deletion evaluation/aider_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -245,7 +246,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)
metrics = state.metrics.get() if state.metrics else None

# Save the output
Expand Down
3 changes: 2 additions & 1 deletion evaluation/biocoder/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -298,7 +299,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

test_result['generated'] = test_result['metadata']['1_copy_change_code']

Expand Down
5 changes: 3 additions & 2 deletions evaluation/bird/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -45,7 +46,7 @@ def codeact_user_response(state: State) -> str:
# check if the agent has tried to talk to the user 3 times, if so, let the agent know it can give up
user_msgs = [
event
for event in state.history.get_events()
for event in state.history
if isinstance(event, MessageAction) and event.source == 'user'
]
if len(user_msgs) > 2:
Expand Down Expand Up @@ -429,7 +430,7 @@ def execute_sql(db_path, sql):
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
3 changes: 2 additions & 1 deletion evaluation/browsing_delegation/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -89,7 +90,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# find the last delegate action
last_delegate_action = None
Expand Down
5 changes: 3 additions & 2 deletions evaluation/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -164,7 +165,7 @@ def process_instance(

model_answer_raw = ''
# get the last message or thought from the agent
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if event.source == 'agent':
if isinstance(event, AgentFinishAction):
model_answer_raw = event.thought
Expand Down Expand Up @@ -201,7 +202,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
5 changes: 3 additions & 2 deletions evaluation/gorilla/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -99,7 +100,7 @@ def process_instance(
raise ValueError('State should not be None.')

# retrieve the last message from the agent
model_answer_raw = state.history.get_last_agent_message()
model_answer_raw = state.get_last_agent_message()

# attempt to parse model_answer
ast_eval_fn = instance['ast_eval']
Expand All @@ -112,7 +113,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

output = EvalOutput(
instance_id=instance_id,
Expand Down
5 changes: 3 additions & 2 deletions evaluation/gpqa/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -243,7 +244,7 @@ def process_instance(
'C': False,
'D': False,
}
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if (
isinstance(event, AgentFinishAction)
and event.source != 'user'
Expand Down Expand Up @@ -299,7 +300,7 @@ def process_instance(
instance_id=str(instance.instance_id),
instruction=instruction,
metadata=metadata,
history=state.history.compatibility_for_eval_history_pairs(),
history=compatibility_for_eval_history_pairs(state.history),
metrics=metrics,
error=state.last_error if state and state.last_error else None,
test_result={
Expand Down
3 changes: 2 additions & 1 deletion evaluation/humanevalfix/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -253,7 +254,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
2 changes: 1 addition & 1 deletion evaluation/integration_tests/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def process_instance(
# # result evaluation
# # =============================================

histories = state.history.get_events()
histories = state.history
test_result: TestResult = test_class.verify_result(runtime, histories)
metrics = state.metrics.get() if state.metrics else None

Expand Down
5 changes: 3 additions & 2 deletions evaluation/logic_reasoning/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -223,7 +224,7 @@ def process_instance(
raise ValueError('State should not be None.')

final_message = ''
for event in state.history.get_events(reverse=True):
for event in reversed(state.history):
if isinstance(event, AgentFinishAction):
final_message = event.thought
break
Expand All @@ -245,7 +246,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
5 changes: 3 additions & 2 deletions evaluation/miniwob/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -150,7 +151,7 @@ def process_instance(

# Instruction is the first message from the USER
instruction = ''
for event in state.history.get_events():
for event in state.history:
if isinstance(event, MessageAction):
instruction = event.content
break
Expand All @@ -162,7 +163,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
9 changes: 7 additions & 2 deletions evaluation/mint/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand All @@ -28,6 +29,7 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.events.action import (
Action,
CmdRunAction,
MessageAction,
)
Expand All @@ -44,7 +46,10 @@ def codeact_user_response_mint(state: State, task: Task, task_config: dict[str,
task=task,
task_config=task_config,
)
last_action = state.history.get_last_action()
last_action = next(
(event for event in reversed(state.history) if isinstance(event, Action)),
None,
)
result_state: TaskState = env.step(last_action.message or '')

state.extra_data['task_state'] = result_state
Expand Down Expand Up @@ -200,7 +205,7 @@ def process_instance(
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
3 changes: 2 additions & 1 deletion evaluation/ml_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
EvalMetadata,
EvalOutput,
codeact_user_response,
compatibility_for_eval_history_pairs,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
Expand Down Expand Up @@ -254,7 +255,7 @@ def process_instance(instance: Any, metadata: EvalMetadata, reset_logger: bool =
# history is now available as a stream of events, rather than list of pairs of (Action, Observation)
# for compatibility with the existing output format, we can remake the pairs here
# remove when it becomes unnecessary
histories = state.history.compatibility_for_eval_history_pairs()
histories = compatibility_for_eval_history_pairs(state.history)

# Save the output
output = EvalOutput(
Expand Down
Loading