Skip to content

Commit

Permalink
Merge pull request #1480 from mito-ds/prefix-autocomplete-evals
Browse files Browse the repository at this point in the history
Prefix autocomplete evals
  • Loading branch information
aarondr77 authored Jan 14, 2025
2 parents 79ac78a + 05f586a commit 447a649
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
InlineCodeCompletionTestCase(
name="complete_condition_from_data_understanding",
test_case_core=FILTER_ANNUAL_INCOME_AND_LOAN_CONDITION,
prefix="loans_df = loans_df[loans_df['annual_income'] > 100000 & loans_df['loan_condition'] == 'Bad ",
prefix="loans_df = loans_df[(loans_df['annual_income'] > 100000) & (loans_df['loan_condition'] == Bad ",
suffix="",
type_tags=['code_completion'],
),
Expand Down Expand Up @@ -255,16 +255,16 @@
InlineCodeCompletionTestCase(
name="separate_data_by_column_value_comment",
test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
prefix="""# Create a new dataframe for each income category. ie: loans_df_low, loans_df_medium, etc.""",
prefix="""# Create a new dataframe for each income category. ie: low_df, medium_df, etc.""",
suffix="",
type_tags=['comment_following'],
),

InlineCodeCompletionTestCase(
name="separate_data_by_column_value_prefix",
test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
prefix="""loans_df_low = loans_df[loans_df['income_category'] == 'Low']
loans_df_medium = loans_df[loans_df['income_category'] == 'Medium']""",
prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']
medium_df = loans_df[loans_df['income_category'] == 'Medium']""",
suffix="",
type_tags=['code_completion'],
),
Expand All @@ -274,17 +274,17 @@
test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
prefix="""""",
suffix="""
loans_df_medium = loans_df[loans_df['income_category'] == 'Medium']
loans_df_high = loans_df[loans_df['income_category'] == 'High']""",
medium_df = loans_df[loans_df['income_category'] == 'Medium']
high_df = loans_df[loans_df['income_category'] == 'High']""",
type_tags=['code_completion'],
),

InlineCodeCompletionTestCase(
name="separate_data_by_column_value_prefix_and_suffix",
test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
prefix="""loans_df_low = loans_df[loans_df['income_category'] == 'Low']""",
prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']""",
suffix="""
loans_df_high = loans_df[loans_df['income_category'] == 'High']
high_df = loans_df[loans_df['income_category'] == 'High']
""",
type_tags=['code_completion'],
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# Return the sum of two numbers
def my_sum(a, b):
return a + b
x = my_sum(1, 2)
""",
),
prefix="""# Return the sum of two numbers""",
Expand Down
35 changes: 32 additions & 3 deletions evals/test_runners/code_gen_test_runner.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pprint
from typing import Dict, List, Literal, Optional, Union
from evals.ai_api_calls.get_open_ai_completion import get_open_ai_completion
from evals.asserts.equal_outputs import assert_equal_outputs
Expand All @@ -8,12 +9,11 @@
from evals.test_cases.inline_code_completion_tests import INLINE_CODE_COMPLETION_TESTS
from evals.test_runners.utils import exec_code_and_get_globals_and_output
from evals.utils import get_script_from_cells, print_test_case_result_tables
from evals.asserts.equal_globals import assert_equal_globals
from evals.asserts.equal_globals import assert_equal_globals, get_globals_to_compare

def run_chat_tests(test_name: Optional[str], prompt_name: Optional[str], tags: Optional[List[str]]):
    """Run the chat-style evals.

    Thin wrapper that delegates to ``_run_code_gen_tests`` with the 'chat'
    test type, the chat test-case list, and the chat prompt generators.

    Args:
        test_name: If given, only run the test case with this name.
        prompt_name: If given, only run against the prompt generator with this name.
        tags: If given, only run test cases matching these tags.
    """
    _run_code_gen_tests('chat', CHAT_TESTS, CHAT_PROMPT_GENERATORS, test_name, prompt_name, tags)


def run_inline_code_completion_tests(test_name: Optional[str], prompt_name: Optional[str], tags: Optional[List[str]]):
    """Run the inline code-completion evals.

    Thin wrapper that delegates to ``_run_code_gen_tests`` with the
    'inline_code_completion' test type, the inline-completion test-case list,
    and the inline-completion prompt generators.

    Args:
        test_name: If given, only run the test case with this name.
        prompt_name: If given, only run against the prompt generator with this name.
        tags: If given, only run test cases matching these tags.
    """
    _run_code_gen_tests('inline_code_completion', INLINE_CODE_COMPLETION_TESTS, INLINE_CODE_COMPLETION_PROMPT_GENERATORS, test_name, prompt_name, tags)

Expand Down Expand Up @@ -65,7 +65,7 @@ def run_code_gen_test(
test: Union[ChatTestCase, InlineCodeCompletionTestCase],
prompt_generator: Union[ChatPromptGenerator, InlineCodeCompletionPromptGenerator]
) -> TestCaseResult:
print(f"Running test: {test.name}")
print(f"\n\033[1mRunning test: {test.name}\033[0m")

# Get the script from the cells
current_cell_contents_script = get_script_from_cells(test.test_case_core.notebook_state.cell_contents)
Expand Down Expand Up @@ -97,12 +97,41 @@ def run_code_gen_test(
except Exception as e:
# Fail early if we can't execute the code
print(f"Failed to execute code with error: {e}")
print(f"AI Generated Code: {ai_generated_code}")
print(f"Actual Code: {actual_code}")
print(f"Expected Code: {expected_code}")
return TestCaseResult(test=test, passed=False)

equal_globals = assert_equal_globals(expected_globals, actual_globals, test.test_case_core.variables_to_compare)
equal_outputs = assert_equal_outputs(expected_output, actual_output)

passed = equal_globals and equal_outputs

if not passed:
debug_failed_test_case(test, ai_generated_code, actual_code, expected_code, equal_globals, equal_outputs, expected_globals, actual_globals, expected_output, actual_output)

return TestCaseResult(test=test, passed=passed)


def debug_failed_test_case(
    test: Union[ChatTestCase, InlineCodeCompletionTestCase],
    ai_generated_code: str,
    actual_code: str,
    expected_code: str,
    equal_globals: bool,
    equal_outputs: bool,
    expected_globals: Dict[str, str],
    actual_globals: Dict[str, str],
    expected_output: str,
    actual_output: str,
) -> None:
    """Print a debugging summary for a failed test case.

    Always shows the AI-generated code alongside the actual and expected
    scripts plus the two equality flags, then dumps the expected/actual
    values for whichever comparison (globals and/or outputs) diverged.

    Args:
        test: The test case that failed (currently unused; kept so callers
            can pass full context and for future per-test reporting).
        ai_generated_code: The raw code produced by the model.
        actual_code: The full script actually executed (cells + AI code).
        expected_code: The full script expected to be executed.
        equal_globals: Whether the compared globals matched.
        equal_outputs: Whether the captured outputs matched.
        expected_globals: Globals produced by the expected script.
        actual_globals: Globals produced by the actual script.
        expected_output: Stdout produced by the expected script.
        actual_output: Stdout produced by the actual script.
    """
    print(f"AI Generated Code: {ai_generated_code}")
    print(f"Actual Code: {actual_code}")
    print(f"Expected Code: {expected_code}")
    print(f"Equal Globals: {equal_globals}")
    print(f"Equal Outputs: {equal_outputs}")
    # Fix: the original accepted the globals but never printed them, so a
    # globals mismatch gave no detail. Mirror the output-dump behavior.
    if not equal_globals:
        print(f"Expected Globals: {expected_globals}")
        print(f"Actual Globals: {actual_globals}")
    if not equal_outputs:
        print(f"Expected Output: {expected_output}")
        print(f"Actual Output: {actual_output}")

0 comments on commit 447a649

Please sign in to comment.