From bcc46201b7a8f1c47468a1cb35404fe55a4f796a Mon Sep 17 00:00:00 2001 From: Aaron Diamond-Reivich Date: Mon, 13 Jan 2025 14:00:30 -0500 Subject: [PATCH 1/8] evals: improve output for evals understanding --- evals/test_runners/code_gen_test_runner.py | 40 ++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/evals/test_runners/code_gen_test_runner.py b/evals/test_runners/code_gen_test_runner.py index 980f0b069..27e882a35 100644 --- a/evals/test_runners/code_gen_test_runner.py +++ b/evals/test_runners/code_gen_test_runner.py @@ -1,3 +1,4 @@ +import pprint from typing import Dict, List, Literal, Optional, Union from evals.ai_api_calls.get_open_ai_completion import get_open_ai_completion from evals.asserts.equal_outputs import assert_equal_outputs @@ -8,12 +9,11 @@ from evals.test_cases.inline_code_completion_tests import INLINE_CODE_COMPLETION_TESTS from evals.test_runners.utils import exec_code_and_get_globals_and_output from evals.utils import get_script_from_cells, print_test_case_result_tables -from evals.asserts.equal_globals import assert_equal_globals +from evals.asserts.equal_globals import assert_equal_globals, get_globals_to_compare def run_chat_tests(test_name: Optional[str], prompt_name: Optional[str], tags: Optional[List[str]]): _run_code_gen_tests('chat', CHAT_TESTS, CHAT_PROMPT_GENERATORS, test_name, prompt_name, tags) - def run_inline_code_completion_tests(test_name: Optional[str], prompt_name: Optional[str], tags: Optional[List[str]]): _run_code_gen_tests('inline_code_completion', INLINE_CODE_COMPLETION_TESTS, INLINE_CODE_COMPLETION_PROMPT_GENERATORS, test_name, prompt_name, tags) @@ -65,7 +65,7 @@ def run_code_gen_test( test: Union[ChatTestCase, InlineCodeCompletionTestCase], prompt_generator: Union[ChatPromptGenerator, InlineCodeCompletionPromptGenerator] ) -> TestCaseResult: - print(f"Running test: {test.name}") + print(f"\nRunning test: {test.name}") # Get the script from the cells current_cell_contents_script = get_script_from_cells(test.test_case_core.notebook_state.cell_contents) @@ -97,6 +97,9 @@ def run_code_gen_test( except Exception as e: # Fail early if we can't execute the code print(f"Failed to execute code with error: {e}") + print(f"AI Generated Code: {ai_generated_code}") + print(f"Actual Code: {actual_code}") + print(f"Expected Code: {expected_code}") return TestCaseResult(test=test, passed=False) equal_globals = assert_equal_globals(expected_globals, actual_globals, test.test_case_core.variables_to_compare) @@ -104,5 +107,36 @@ def run_code_gen_test( passed = equal_globals and equal_outputs + if not passed: + debug_failed_test_case(test, ai_generated_code, actual_code, expected_code, equal_globals, equal_outputs, expected_globals, actual_globals, expected_output, actual_output) + return TestCaseResult(test=test, passed=passed) + +def debug_failed_test_case( + test: Union[ChatTestCase, InlineCodeCompletionTestCase], + ai_generated_code: str, + actual_code: str, + expected_code: str, + equal_globals: bool, + equal_outputs: bool, + expected_globals: Dict[str, str], + actual_globals: Dict[str, str], + expected_output: str, + actual_output: str, + ) -> None: + + + print(f"AI Generated Code: {ai_generated_code}") + print(f"Actual Code: {actual_code}") + print(f"Expected Code: {expected_code}") + print(f"Equal Globals: {equal_globals}") + if not equal_globals: + expected_globals_to_compare = get_globals_to_compare(expected_globals, test.test_case_core.variables_to_compare) + actual_globals_to_compare = get_globals_to_compare(actual_globals, test.test_case_core.variables_to_compare) + pprint.pprint(f"Expected Globals: {expected_globals_to_compare}") + pprint.pprint(f"Actual Globals: {actual_globals_to_compare}") + print(f"Equal Outputs: {equal_outputs}") + if not equal_outputs: + print(f"Expected Output: {expected_output}") + print(f"Actual Output: {actual_output}") From 1f8470697469bd4f76b0b2da995371be3126e714 Mon Sep 17 00:00:00 2001 From: Aaron Diamond-Reivich Date: Mon, 13 Jan 2025 14:00:47 -0500 Subject: [PATCH 2/8] evals: fix broken complete_condition_from_data_understanding test --- .../dataframe_transformation_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py index e5059ac92..a7dd4c187 100644 --- a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py +++ b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py @@ -37,7 +37,7 @@ InlineCodeCompletionTestCase( name="complete_condition_from_data_understanding", test_case_core=FILTER_ANNUAL_INCOME_AND_LOAN_CONDITION, - prefix="loans_df = loans_df[loans_df['annual_income'] > 100000 & loans_df['loan_condition'] == 'Bad ", + prefix="loans_df = loans_df[(loans_df['annual_income'] > 100000) & (loans_df['loan_condition'] == Bad ", suffix="", type_tags=['code_completion'], ), From eca15f41d3971903e7fac6f38965a677c014d44e Mon Sep 17 00:00:00 2001 From: Aaron Diamond-Reivich Date: Mon, 13 Jan 2025 14:21:45 -0500 Subject: [PATCH 3/8] evals: fix separate_data_by_column_value_comment eval --- .../dataframe_transformation_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py index a7dd4c187..d80966365 100644 --- a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py +++ b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py @@ -255,7 +255,7 @@ InlineCodeCompletionTestCase( name="separate_data_by_column_value_comment", test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE, - prefix="""# Create a new dataframe for each income category. ie: loans_df_low, loans_df_medium, etc.""", + prefix="""# Create a new dataframe for each income category. ie: low_df, medium_df, etc.""", suffix="", type_tags=['comment_following'], ), From 58a00c65deaed8c309ffe5f0797669195c7d8e81 Mon Sep 17 00:00:00 2001 From: Aaron Diamond-Reivich Date: Mon, 13 Jan 2025 14:26:30 -0500 Subject: [PATCH 4/8] evals: fix separate_data_by_column_value_prefix_and_suffix and separate_data_by_column_value_suffix --- .../dataframe_transformation_tests.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py index d80966365..1067a722f 100644 --- a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py +++ b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py @@ -263,8 +263,8 @@ InlineCodeCompletionTestCase( name="separate_data_by_column_value_prefix", test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE, - prefix="""loans_df_low = loans_df[loans_df['income_category'] == 'Low'] -loans_df_medium = loans_df[loans_df['income_category'] == 'Medium']""", + prefix="""low_df = loans_df[loans_df['income_category'] == 'Low'] +medium_df = loans_df[loans_df['income_category'] == 'Medium']""", suffix="", type_tags=['code_completion'], ), @@ -274,17 +274,17 @@ test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE, prefix="""""", suffix=""" -loans_df_medium = loans_df[loans_df['income_category'] == 'Medium'] -loans_df_high = loans_df[loans_df['income_category'] == 'High']""", +medium_df = loans_df[loans_df['income_category'] == 'Medium'] +high_df = loans_df[loans_df['income_category'] == 'High']""", type_tags=['code_completion'], ), InlineCodeCompletionTestCase( name="separate_data_by_column_value_prefix_and_suffix", test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE, - prefix="""loans_df_low = loans_df[loans_df['income_category'] == 'Low']""", + prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']""", suffix=""" -loans_df_high = loans_df[loans_df['income_category'] == 'High'] +low_df = loans_df[loans_df['income_category'] == 'High'] """, type_tags=['code_completion'], ), From c31188d329f2cb3d5327f49cbcaf599bed5617e8 Mon Sep 17 00:00:00 2001 From: Aaron Diamond-Reivich Date: Mon, 13 Jan 2025 14:31:00 -0500 Subject: [PATCH 5/8] evals: fix separate_data_by_column_value_prefix_and_suffix eval --- .../dataframe_transformation_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py index 1067a722f..966e23c8c 100644 --- a/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py +++ b/evals/test_cases/inline_code_completion_tests/dataframe_transformation_tests.py @@ -284,7 +284,7 @@ test_case_core=SEPARATE_DATA_BY_COLUMN_VALUE, prefix="""low_df = loans_df[loans_df['income_category'] == 'Low']""", suffix=""" -low_df = loans_df[loans_df['income_category'] == 'High'] +high_df = loans_df[loans_df['income_category'] == 'High'] """, type_tags=['code_completion'], ), From 6fa1af9b84e2ffe1326eb6fa0253b6dcc609620f Mon Sep 17 00:00:00 2001 From: Aaron Diamond-Reivich Date: Mon, 13 Jan 2025 14:31:48 -0500 Subject: [PATCH 6/8] evals: improve testing output --- evals/test_runners/code_gen_test_runner.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/evals/test_runners/code_gen_test_runner.py b/evals/test_runners/code_gen_test_runner.py index 27e882a35..269c355af 100644 --- a/evals/test_runners/code_gen_test_runner.py +++ b/evals/test_runners/code_gen_test_runner.py @@ -131,11 +131,6 @@ def debug_failed_test_case( print(f"Actual Code: {actual_code}") print(f"Expected Code: {expected_code}") print(f"Equal Globals: {equal_globals}") - if not equal_globals: - expected_globals_to_compare = get_globals_to_compare(expected_globals, test.test_case_core.variables_to_compare) - actual_globals_to_compare = get_globals_to_compare(actual_globals, test.test_case_core.variables_to_compare) - pprint.pprint(f"Expected Globals: {expected_globals_to_compare}") - pprint.pprint(f"Actual Globals: {actual_globals_to_compare}") print(f"Equal Outputs: {equal_outputs}") if not equal_outputs: print(f"Expected Output: {expected_output}") From 050f3ff6197deee6046b8b506ee4725d09659d97 Mon Sep 17 00:00:00 2001 From: Aaron Diamond-Reivich Date: Mon, 13 Jan 2025 14:34:50 -0500 Subject: [PATCH 7/8] evals: bold text for easier reading --- evals/test_runners/code_gen_test_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/test_runners/code_gen_test_runner.py b/evals/test_runners/code_gen_test_runner.py index 269c355af..f55afcf13 100644 --- a/evals/test_runners/code_gen_test_runner.py +++ b/evals/test_runners/code_gen_test_runner.py @@ -65,7 +65,7 @@ def run_code_gen_test( test: Union[ChatTestCase, InlineCodeCompletionTestCase], prompt_generator: Union[ChatPromptGenerator, InlineCodeCompletionPromptGenerator] ) -> TestCaseResult: - print(f"\nRunning test: {test.name}") + print(f"\n\033[1mRunning test: {test.name}\033[0m") # Get the script from the cells current_cell_contents_script = get_script_from_cells(test.test_case_core.notebook_state.cell_contents) From 05f586a110b6865dd4c98b6e49f81d6c338e784b Mon Sep 17 00:00:00 2001 From: Aaron Diamond-Reivich Date: Mon, 13 Jan 2025 14:38:05 -0500 Subject: [PATCH 8/8] evals: fix create_my_sum_function_from_comment eval --- evals/test_cases/inline_code_completion_tests/function_tests.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evals/test_cases/inline_code_completion_tests/function_tests.py b/evals/test_cases/inline_code_completion_tests/function_tests.py index d1c284af3..35a60b664 100644 --- a/evals/test_cases/inline_code_completion_tests/function_tests.py +++ b/evals/test_cases/inline_code_completion_tests/function_tests.py @@ -13,6 +13,8 @@ # Return the sum of two numbers def my_sum(a, b): return a + b + +x = my_sum(1, 2) """, ), prefix="""# Return the sum of two numbers""",