Skip to content

Commit

Permalink
Merge pull request #1480 from mito-ds/prefix-autocomplete-evals
Browse files Browse the repository at this point in the history
Prefix autocomplete evals
  • Loading branch information
aarondr77 authored Jan 14, 2025
2 parents 79ac78a + 05f586a commit 447a649
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
InlineCodeCompletionTestCase(
name="complete_condition_from_data_understanding",
test_case_core=FILTER_ANNUAL_INCOME_AND_LOAN_CONDITION,
prefix="loans_df = loans_df[loans_df['annual_income'] > 100000 & loans_df['loan_condition'] == 'Bad ",
prefix="loans_df = loans_df[(loans_df['annual_income'] > 100000) & (loans_df['loan_condition'] == Bad ",
suffix="",
type_tags=['code_completion'],
),
Expand Down Expand Up @@ -255,16 +255,16 @@
InlineCodeCompletionTestCase(
name="separate_data_by_column_value_comment",
test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
prefix="""# Create a new dataframe for each income category. ie: loans_df_low, loans_df_medium, etc.""",
prefix="""# Create a new dataframe for each income category. ie: low_df, medium_df, etc.""",
suffix="",
type_tags=['comment_following'],
),

InlineCodeCompletionTestCase(
name="separate_data_by_column_value_prefix",
test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
prefix="""loans_df_low = loans_df[loans_df['income_category'] == 'Low']
loans_df_medium = loans_df[loans_df['income_category'] == 'Medium']""",
prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']
medium_df = loans_df[loans_df['income_category'] == 'Medium']""",
suffix="",
type_tags=['code_completion'],
),
Expand All @@ -274,17 +274,17 @@
test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
prefix="""""",
suffix="""
loans_df_medium = loans_df[loans_df['income_category'] == 'Medium']
loans_df_high = loans_df[loans_df['income_category'] == 'High']""",
medium_df = loans_df[loans_df['income_category'] == 'Medium']
high_df = loans_df[loans_df['income_category'] == 'High']""",
type_tags=['code_completion'],
),

InlineCodeCompletionTestCase(
name="separate_data_by_column_value_prefix_and_suffix",
test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
prefix="""loans_df_low = loans_df[loans_df['income_category'] == 'Low']""",
prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']""",
suffix="""
loans_df_high = loans_df[loans_df['income_category'] == 'High']
high_df = loans_df[loans_df['income_category'] == 'High']
""",
type_tags=['code_completion'],
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# Return the sum of two numbers
def my_sum(a, b):
return a + b
x = my_sum(1, 2)
""",
),
prefix="""# Return the sum of two numbers""",
Expand Down
35 changes: 32 additions & 3 deletions evals/test_runners/code_gen_test_runner.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pprint
from typing import Dict, List, Literal, Optional, Union
from evals.ai_api_calls.get_open_ai_completion import get_open_ai_completion
from evals.asserts.equal_outputs import assert_equal_outputs
Expand All @@ -8,12 +9,11 @@
from evals.test_cases.inline_code_completion_tests import INLINE_CODE_COMPLETION_TESTS
from evals.test_runners.utils import exec_code_and_get_globals_and_output
from evals.utils import get_script_from_cells, print_test_case_result_tables
from evals.asserts.equal_globals import assert_equal_globals
from evals.asserts.equal_globals import assert_equal_globals, get_globals_to_compare

def run_chat_tests(test_name: Optional[str], prompt_name: Optional[str], tags: Optional[List[str]]):
    """Run the chat-style evals.

    Thin wrapper that delegates to ``_run_code_gen_tests`` with the 'chat'
    test type, the chat test-case list, and the chat prompt generators.

    Args:
        test_name: If given, only run the test case with this name.
        prompt_name: If given, only run against the prompt generator with this name.
        tags: If given, only run test cases matching these tags.
    """
    _run_code_gen_tests('chat', CHAT_TESTS, CHAT_PROMPT_GENERATORS, test_name, prompt_name, tags)


def run_inline_code_completion_tests(test_name: Optional[str], prompt_name: Optional[str], tags: Optional[List[str]]):
    """Run the inline code-completion evals.

    Thin wrapper that delegates to ``_run_code_gen_tests`` with the
    'inline_code_completion' test type, the inline-completion test-case list,
    and the inline-completion prompt generators.

    Args:
        test_name: If given, only run the test case with this name.
        prompt_name: If given, only run against the prompt generator with this name.
        tags: If given, only run test cases matching these tags.
    """
    _run_code_gen_tests('inline_code_completion', INLINE_CODE_COMPLETION_TESTS, INLINE_CODE_COMPLETION_PROMPT_GENERATORS, test_name, prompt_name, tags)

Expand Down Expand Up @@ -65,7 +65,7 @@ def run_code_gen_test(
test: Union[ChatTestCase, InlineCodeCompletionTestCase],
prompt_generator: Union[ChatPromptGenerator, InlineCodeCompletionPromptGenerator]
) -> TestCaseResult:
print(f"Running test: {test.name}")
print(f"\n\033[1mRunning test: {test.name}\033[0m")

# Get the script from the cells
current_cell_contents_script = get_script_from_cells(test.test_case_core.notebook_state.cell_contents)
Expand Down Expand Up @@ -97,12 +97,41 @@ def run_code_gen_test(
except Exception as e:
# Fail early if we can't execute the code
print(f"Failed to execute code with error: {e}")
print(f"AI Generated Code: {ai_generated_code}")
print(f"Actual Code: {actual_code}")
print(f"Expected Code: {expected_code}")
return TestCaseResult(test=test, passed=False)

equal_globals = assert_equal_globals(expected_globals, actual_globals, test.test_case_core.variables_to_compare)
equal_outputs = assert_equal_outputs(expected_output, actual_output)

passed = equal_globals and equal_outputs

if not passed:
debug_failed_test_case(test, ai_generated_code, actual_code, expected_code, equal_globals, equal_outputs, expected_globals, actual_globals, expected_output, actual_output)

return TestCaseResult(test=test, passed=passed)


def debug_failed_test_case(
    test: Union[ChatTestCase, InlineCodeCompletionTestCase],
    ai_generated_code: str,
    actual_code: str,
    expected_code: str,
    equal_globals: bool,
    equal_outputs: bool,
    expected_globals: Dict[str, str],
    actual_globals: Dict[str, str],
    expected_output: str,
    actual_output: str,
) -> None:
    """Print a debugging summary for a failed test case.

    Always shows the AI-generated code alongside the actual and expected
    scripts plus the two equality flags, then dumps the expected/actual
    values for whichever comparison (globals and/or outputs) diverged.

    Args:
        test: The test case that failed (currently unused; kept so callers
            can pass full context and for future per-test reporting).
        ai_generated_code: The raw code produced by the model.
        actual_code: The full script actually executed (cells + AI code).
        expected_code: The full script expected to be executed.
        equal_globals: Whether the compared globals matched.
        equal_outputs: Whether the captured outputs matched.
        expected_globals: Globals produced by the expected script.
        actual_globals: Globals produced by the actual script.
        expected_output: Stdout produced by the expected script.
        actual_output: Stdout produced by the actual script.
    """
    print(f"AI Generated Code: {ai_generated_code}")
    print(f"Actual Code: {actual_code}")
    print(f"Expected Code: {expected_code}")
    print(f"Equal Globals: {equal_globals}")
    print(f"Equal Outputs: {equal_outputs}")
    # Fix: the original accepted the globals but never printed them, so a
    # globals mismatch gave no detail. Mirror the output-dump behavior.
    if not equal_globals:
        print(f"Expected Globals: {expected_globals}")
        print(f"Actual Globals: {actual_globals}")
    if not equal_outputs:
        print(f"Expected Output: {expected_output}")
        print(f"Actual Output: {actual_output}")

0 comments on commit 447a649

Please sign in to comment.