From bcc46201b7a8f1c47468a1cb35404fe55a4f796a Mon Sep 17 00:00:00 2001
From: Aaron Diamond-Reivich <aarondr77@gmail.com>
Date: Mon, 13 Jan 2025 14:00:30 -0500
Subject: [PATCH 1/8] evals: improve output to make eval failures easier to debug

---
 evals/test_runners/code_gen_test_runner.py | 40 ++++++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/evals/test_runners/code_gen_test_runner.py b/evals/test_runners/code_gen_test_runner.py
index 980f0b069..27e882a35 100644
--- a/evals/test_runners/code_gen_test_runner.py
+++ b/evals/test_runners/code_gen_test_runner.py
@@ -1,3 +1,4 @@
+import pprint
 from typing import Dict, List, Literal, Optional, Union
 from evals.ai_api_calls.get_open_ai_completion import get_open_ai_completion
 from evals.asserts.equal_outputs import assert_equal_outputs
@@ -8,12 +9,11 @@
 from evals.test_cases.inline_code_completion_tests import INLINE_CODE_COMPLETION_TESTS
 from evals.test_runners.utils import exec_code_and_get_globals_and_output
 from evals.utils import get_script_from_cells, print_test_case_result_tables
-from evals.asserts.equal_globals import assert_equal_globals
+from evals.asserts.equal_globals import assert_equal_globals, get_globals_to_compare
 
 def run_chat_tests(test_name: Optional[str], prompt_name: Optional[str], tags: Optional[List[str]]):
     _run_code_gen_tests('chat', CHAT_TESTS, CHAT_PROMPT_GENERATORS, test_name, prompt_name, tags)
 
-    
 def run_inline_code_completion_tests(test_name: Optional[str], prompt_name: Optional[str], tags: Optional[List[str]]):
     _run_code_gen_tests('inline_code_completion', INLINE_CODE_COMPLETION_TESTS, INLINE_CODE_COMPLETION_PROMPT_GENERATORS, test_name, prompt_name, tags)
 
@@ -65,7 +65,7 @@ def run_code_gen_test(
         test: Union[ChatTestCase, InlineCodeCompletionTestCase], 
         prompt_generator: Union[ChatPromptGenerator, InlineCodeCompletionPromptGenerator]
 ) -> TestCaseResult:
-    print(f"Running test: {test.name}")
+    print(f"\nRunning test: {test.name}")
 
     # Get the script from the cells
     current_cell_contents_script = get_script_from_cells(test.test_case_core.notebook_state.cell_contents)
@@ -97,6 +97,9 @@ def run_code_gen_test(
     except Exception as e:
         # Fail early if we can't execute the code
         print(f"Failed to execute code with error: {e}")
+        print(f"AI Generated Code: {ai_generated_code}")
+        print(f"Actual Code: {actual_code}")
+        print(f"Expected Code: {expected_code}")
         return TestCaseResult(test=test, passed=False)
 
     equal_globals = assert_equal_globals(expected_globals, actual_globals, test.test_case_core.variables_to_compare)
@@ -104,5 +107,36 @@ def run_code_gen_test(
 
     passed = equal_globals and equal_outputs
 
+    if not passed:
+        debug_failed_test_case(test, ai_generated_code, actual_code, expected_code, equal_globals, equal_outputs, expected_globals, actual_globals, expected_output, actual_output)
+
     return TestCaseResult(test=test, passed=passed)
 
+
+def debug_failed_test_case(
+        test: Union[ChatTestCase, InlineCodeCompletionTestCase],
+        ai_generated_code: str, 
+        actual_code: str,
+        expected_code: str, 
+        equal_globals: bool, 
+        equal_outputs: bool, 
+        expected_globals: Dict[str, str],
+        actual_globals: Dict[str, str],
+        expected_output: str,
+        actual_output: str,
+    ) -> None:
+
+
+    print(f"AI Generated Code: {ai_generated_code}")
+    print(f"Actual Code: {actual_code}")
+    print(f"Expected Code: {expected_code}")
+    print(f"Equal Globals: {equal_globals}")
+    if not equal_globals:
+        expected_globals_to_compare = get_globals_to_compare(expected_globals, test.test_case_core.variables_to_compare)
+        actual_globals_to_compare = get_globals_to_compare(actual_globals, test.test_case_core.variables_to_compare)
+        pprint.pprint(f"Expected Globals: {expected_globals_to_compare}")
+        pprint.pprint(f"Actual Globals: {actual_globals_to_compare}")
+    print(f"Equal Outputs: {equal_outputs}")
+    if not equal_outputs:
+        print(f"Expected Output: {expected_output}")
+        print(f"Actual Output: {actual_output}")

From 1f8470697469bd4f76b0b2da995371be3126e714 Mon Sep 17 00:00:00 2001
From: Aaron Diamond-Reivich <aarondr77@gmail.com>
Date: Mon, 13 Jan 2025 14:00:47 -0500
Subject: [PATCH 2/8] evals: fix broken
 complete_condition_from_data_understanding test

---
 .../dataframe_transformation_tests.py                           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
index e5059ac92..a7dd4c187 100644
--- a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
+++ b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
@@ -37,7 +37,7 @@
     InlineCodeCompletionTestCase(
         name="complete_condition_from_data_understanding",
         test_case_core=FILTER_ANNUAL_INCOME_AND_LOAN_CONDITION,
-        prefix="loans_df = loans_df[loans_df['annual_income'] > 100000 & loans_df['loan_condition'] == 'Bad ",
+        prefix="loans_df = loans_df[(loans_df['annual_income'] > 100000) & (loans_df['loan_condition'] == 'Bad ",
         suffix="",
         type_tags=['code_completion'],
     ),

From eca15f41d3971903e7fac6f38965a677c014d44e Mon Sep 17 00:00:00 2001
From: Aaron Diamond-Reivich <aarondr77@gmail.com>
Date: Mon, 13 Jan 2025 14:21:45 -0500
Subject: [PATCH 3/8] evals: fix separate_data_by_column_value_comment eval

---
 .../dataframe_transformation_tests.py                           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
index a7dd4c187..d80966365 100644
--- a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
+++ b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
@@ -255,7 +255,7 @@
     InlineCodeCompletionTestCase(
         name="separate_data_by_column_value_comment",
         test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
-        prefix="""# Create a new dataframe for each income category. ie: loans_df_low, loans_df_medium, etc.""",
+        prefix="""# Create a new dataframe for each income category. ie: low_df, medium_df, etc.""",
         suffix="",
         type_tags=['comment_following'],
     ),

From 58a00c65deaed8c309ffe5f0797669195c7d8e81 Mon Sep 17 00:00:00 2001
From: Aaron Diamond-Reivich <aarondr77@gmail.com>
Date: Mon, 13 Jan 2025 14:26:30 -0500
Subject: [PATCH 4/8] evals: fix
 separate_data_by_column_value_prefix_and_suffix and
 separate_data_by_column_value_suffix

---
 .../dataframe_transformation_tests.py                | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
index d80966365..1067a722f 100644
--- a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
+++ b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
@@ -263,8 +263,8 @@
     InlineCodeCompletionTestCase(
         name="separate_data_by_column_value_prefix",
         test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
-        prefix="""loans_df_low = loans_df[loans_df['income_category'] == 'Low']
-loans_df_medium = loans_df[loans_df['income_category'] == 'Medium']""",
+        prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']
+medium_df = loans_df[loans_df['income_category'] == 'Medium']""",
         suffix="",
         type_tags=['code_completion'],
     ),
@@ -274,17 +274,17 @@
         test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
         prefix="""""",
         suffix="""
-loans_df_medium = loans_df[loans_df['income_category'] == 'Medium']
-loans_df_high = loans_df[loans_df['income_category'] == 'High']""",
+medium_df = loans_df[loans_df['income_category'] == 'Medium']
+high_df = loans_df[loans_df['income_category'] == 'High']""",
         type_tags=['code_completion'],
     ),
 
     InlineCodeCompletionTestCase(
         name="separate_data_by_column_value_prefix_and_suffix",
         test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
-        prefix="""loans_df_low = loans_df[loans_df['income_category'] == 'Low']""",
+        prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']""",
         suffix="""
-loans_df_high = loans_df[loans_df['income_category'] == 'High']
+low_df = loans_df[loans_df['income_category'] == 'High']
 """,
         type_tags=['code_completion'],
     ),

From c31188d329f2cb3d5327f49cbcaf599bed5617e8 Mon Sep 17 00:00:00 2001
From: Aaron Diamond-Reivich <aarondr77@gmail.com>
Date: Mon, 13 Jan 2025 14:31:00 -0500
Subject: [PATCH 5/8] evals: fix
 separate_data_by_column_value_prefix_and_suffix eval

---
 .../dataframe_transformation_tests.py                           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
index 1067a722f..966e23c8c 100644
--- a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
+++ b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py
@@ -284,7 +284,7 @@
         test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE,
         prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']""",
         suffix="""
-low_df = loans_df[loans_df['income_category'] == 'High']
+high_df = loans_df[loans_df['income_category'] == 'High']
 """,
         type_tags=['code_completion'],
     ),

From 6fa1af9b84e2ffe1326eb6fa0253b6dcc609620f Mon Sep 17 00:00:00 2001
From: Aaron Diamond-Reivich <aarondr77@gmail.com>
Date: Mon, 13 Jan 2025 14:31:48 -0500
Subject: [PATCH 6/8] evals: improve testing output by dropping globals dump

---
 evals/test_runners/code_gen_test_runner.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/evals/test_runners/code_gen_test_runner.py b/evals/test_runners/code_gen_test_runner.py
index 27e882a35..269c355af 100644
--- a/evals/test_runners/code_gen_test_runner.py
+++ b/evals/test_runners/code_gen_test_runner.py
@@ -131,11 +131,6 @@ def debug_failed_test_case(
     print(f"Actual Code: {actual_code}")
     print(f"Expected Code: {expected_code}")
     print(f"Equal Globals: {equal_globals}")
-    if not equal_globals:
-        expected_globals_to_compare = get_globals_to_compare(expected_globals, test.test_case_core.variables_to_compare)
-        actual_globals_to_compare = get_globals_to_compare(actual_globals, test.test_case_core.variables_to_compare)
-        pprint.pprint(f"Expected Globals: {expected_globals_to_compare}")
-        pprint.pprint(f"Actual Globals: {actual_globals_to_compare}")
     print(f"Equal Outputs: {equal_outputs}")
     if not equal_outputs:
         print(f"Expected Output: {expected_output}")

From 050f3ff6197deee6046b8b506ee4725d09659d97 Mon Sep 17 00:00:00 2001
From: Aaron Diamond-Reivich <aarondr77@gmail.com>
Date: Mon, 13 Jan 2025 14:34:50 -0500
Subject: [PATCH 7/8] evals: bold text for easier reading

---
 evals/test_runners/code_gen_test_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/test_runners/code_gen_test_runner.py b/evals/test_runners/code_gen_test_runner.py
index 269c355af..f55afcf13 100644
--- a/evals/test_runners/code_gen_test_runner.py
+++ b/evals/test_runners/code_gen_test_runner.py
@@ -65,7 +65,7 @@ def run_code_gen_test(
         test: Union[ChatTestCase, InlineCodeCompletionTestCase], 
         prompt_generator: Union[ChatPromptGenerator, InlineCodeCompletionPromptGenerator]
 ) -> TestCaseResult:
-    print(f"\nRunning test: {test.name}")
+    print(f"\n\033[1mRunning test: {test.name}\033[0m")
 
     # Get the script from the cells
     current_cell_contents_script = get_script_from_cells(test.test_case_core.notebook_state.cell_contents)

From 05f586a110b6865dd4c98b6e49f81d6c338e784b Mon Sep 17 00:00:00 2001
From: Aaron Diamond-Reivich <aarondr77@gmail.com>
Date: Mon, 13 Jan 2025 14:38:05 -0500
Subject: [PATCH 8/8] evals: fix create_my_sum_function_from_comment eval

---
 evals/test_cases/inline_code_completion_tests/function_tests.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/evals/test_cases/inline_code_completion_tests/function_tests.py b/evals/test_cases/inline_code_completion_tests/function_tests.py
index d1c284af3..35a60b664 100644
--- a/evals/test_cases/inline_code_completion_tests/function_tests.py
+++ b/evals/test_cases/inline_code_completion_tests/function_tests.py
@@ -13,6 +13,8 @@
 # Return the sum of two numbers
 def my_sum(a, b):
     return a + b
+    
+x = my_sum(1, 2)
 """,
         ),
         prefix="""# Return the sum of two numbers""",