From 589fbd7f5c3cf07a8e35b436b5f361a4b3b19f02 Mon Sep 17 00:00:00 2001 From: jp Date: Thu, 10 Aug 2023 15:54:06 +0800 Subject: [PATCH] added eval functions, tests, CI, requirements.txt and a guide for contributions --- .github/workflows/main.yml | 26 ++++++ .gitignore | 6 +- CONTRIBUTING.md | 40 +++++++++ README.md | 6 +- eval/eval.py | 140 +++++++++++++++++++++++++++++ requirements.txt | 4 + tests/__init__.py | 0 tests/test_eval.py | 179 +++++++++++++++++++++++++++++++++++++ 8 files changed, 399 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/main.yml create mode 100644 CONTRIBUTING.md create mode 100644 eval/eval.py create mode 100644 requirements.txt create mode 100644 tests/__init__.py create mode 100644 tests/test_eval.py diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..819727b --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,26 @@ +name: tests + +on: [push, pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: psf/black@stable + test: + runs-on: ubuntu-latest + needs: lint + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + - name: Run tests + run: | + pytest tests diff --git a/.gitignore b/.gitignore index 6e7377c..09e6ac3 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,5 @@ -data/postgres \ No newline at end of file +data/postgres + +# pycache +**/__pycache__/ +.pytest_cache \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..cbb3fc4 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,40 @@ +# Contributing Guidelines + +Thank you for considering contributing to our project! We value your contributions and want to ensure a smooth and collaborative experience for everyone. 
Please take a moment to review the following guidelines. + +## Table of Contents +- [Linting](#linting) +- [Testing](#testing) +- [Submitting Changes](#submitting-changes) + +## Linting + +We use [black](https://black.readthedocs.io/en/stable/) for code formatting and linting. After installing it via pip, you can automatically lint your code with black by adding it as a pre-commit git hook: +```bash +pip install black +echo -e '#!/bin/sh\n#\n# Run linter before commit\nblack $(git rev-parse --show-toplevel)' > .git/hooks/pre-commit && chmod +x .git/hooks/pre-commit +``` + +## Testing + +[_Quis probabit ipsa probationem?_](https://en.wikipedia.org/wiki/Quis_custodiet_ipsos_custodes%3F) + +We have a comprehensive test suite that ensures the quality and reliability of our codebase. To run the tests, you can use the following command: + +```bash +pytest tests +``` + +Please make sure that all tests pass before submitting your changes. + +## Submitting Changes + +When submitting changes to this repository, please follow these steps: + +- Fork the repository and create a new branch for your changes. +- Make your changes, following the coding style and best practices outlined here. +- Run the tests to ensure your changes don't introduce any regressions. +- Lint your code and [squash your commits](https://www.git-tower.com/learn/git/faq/git-squash) down to a single commit. +- Commit your changes and push them to your forked repository. +- Open a pull request to the main repository and provide a detailed description of your changes. +- Your pull request will be reviewed by our team, and we may ask for further improvements or clarifications before merging. Thank you for your contribution! 
\ No newline at end of file diff --git a/README.md b/README.md index 0cfd53d..2c511d2 100644 --- a/README.md +++ b/README.md @@ -48,4 +48,8 @@ The data for importing is already in the exported sql dumps in the `data/export` ```bash ./data/init_db.sh -``` \ No newline at end of file +``` + +## Misc + +We welcome contributions to our project. Please see [CONTRIBUTING.md](https://github.com/defog-ai/sql-generation-evaluation/blob/main/CONTRIBUTING.md) for more information. \ No newline at end of file diff --git a/eval/eval.py b/eval/eval.py new file mode 100644 index 0000000..b9cfc45 --- /dev/null +++ b/eval/eval.py @@ -0,0 +1,140 @@ +# this file contains all of the helper functions used for evaluations + +import re +from func_timeout import func_timeout +import pandas as pd +from pandas.testing import assert_frame_equal, assert_series_equal +from sqlalchemy import create_engine + +# like_pattern = r"LIKE\s+'[^']*'" +like_pattern = r"LIKE[\s\S]*'" + + +def normalize_table( + df: pd.DataFrame, query_category: str, question: str +) -> pd.DataFrame: + """ + Normalizes a dataframe by: + 1. sorting columns in alphabetical order + 2. sorting rows using values from first column to last (if query_category is not 'order_by' and question does not ask for ordering) + 3. 
resetting index + """ + # sort columns in alphabetical order + sorted_df = df.reindex(sorted(df.columns), axis=1) + + # check if query_category is 'order_by' and if question asks for ordering + has_order_by = False + pattern = re.compile(r"(order|sort|arrange)", re.IGNORECASE) + in_question = re.search(pattern, question.lower()) # true if contains + if query_category == "order_by" or in_question: + has_order_by = True + if not has_order_by: + # sort rows using values from first column to last + sorted_df = sorted_df.sort_values(by=list(sorted_df.columns)) + # reset index + sorted_df = sorted_df.reset_index(drop=True) + return sorted_df + + +# for escaping percent signs in regex matches +def escape_percent(match): + # Extract the matched group + group = match.group(0) + # Replace '%' with '%%' within the matched group + escaped_group = group.replace("%", "%%") + # Return the escaped group + return escaped_group + + +def query_postgres_db( + query: str, db_name: str, db_creds: dict, timeout: float +) -> pd.DataFrame: + """ + Runs query on postgres db and returns results as a dataframe. + This assumes that you have the evaluation database running locally. + If you don't, you can following the instructions in the README (Restoring to Postgres) to set it up. 
+ + timeout: time in seconds to wait for query to finish before timing out + """ + try: + db_url = f"postgresql://{db_creds['user']}:{db_creds['password']}@{db_creds['host']}:{db_creds['port']}/{db_name}" + engine = create_engine(db_url) + escaped_query = re.sub( + like_pattern, escape_percent, query, flags=re.IGNORECASE + ) # ignore case of LIKE + results_df = func_timeout( + timeout, pd.read_sql_query, args=(escaped_query, engine) + ) + engine.dispose() # close connection + return results_df + except Exception as e: + if engine: + engine.dispose() # close connection if query fails/timeouts + raise e + + +def compare_df( + df1: pd.DataFrame, df2: pd.DataFrame, query_category: str, question: str +) -> bool: + """ + Compares two dataframes and returns True if they are the same, else False. + """ + df1 = normalize_table(df1, query_category, question) + df2 = normalize_table(df2, query_category, question) + try: + assert_frame_equal(df1, df2, check_dtype=False) # handles dtype mismatches + except AssertionError: + return False + return True + + +def subset_df( + df_sub: pd.DataFrame, + df_super: pd.DataFrame, + query_category: str, + question: str, + verbose: bool = False, +) -> bool: + """ + Checks if df_sub is a subset of df_super + """ + if df_sub.empty: + return True # trivial case + # make a copy of df_super so we don't modify the original while keeping track of matches + df_super_temp = df_super.copy(deep=True) + matched_columns = [] + for col_sub_name in df_sub.columns: + col_match = False + for col_super_name in df_super_temp.columns: + col_sub = df_sub[col_sub_name].sort_values().reset_index(drop=True) + col_super = ( + df_super_temp[col_super_name].sort_values().reset_index(drop=True) + ) + try: + assert_series_equal( + col_sub, col_super, check_dtype=False, check_names=False + ) + col_match = True + matched_columns.append(col_super_name) + # remove col_super_name to prevent us from matching it again + df_super_temp = 
df_super_temp.drop(columns=[col_super_name]) + break + except AssertionError: + continue + if col_match == False: + if verbose: + print(f"no match for {col_sub_name}") + return False + df_sub_normalized = normalize_table(df_sub, query_category, question) + + # get matched columns from df_super, and rename them with columns from df_sub, then normalize + df_super_matched = df_super[matched_columns].rename( + columns=dict(zip(matched_columns, df_sub.columns)) + ) + df_super_matched = normalize_table(df_super_matched, query_category, question) + + try: + assert_frame_equal(df_sub_normalized, df_super_matched, check_dtype=False) + return True + except AssertionError: + return False diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6598cf8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +func_timeout +pandas +pytest +sqlalchemy \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_eval.py b/tests/test_eval.py new file mode 100644 index 0000000..05e0c12 --- /dev/null +++ b/tests/test_eval.py @@ -0,0 +1,179 @@ +import pandas as pd +from pandas.testing import assert_frame_equal +import pytest +from eval.eval import normalize_table, compare_df, subset_df + +query = "SELECT * FROM table_name" +query_order_by = "SELECT * FROM table_name ORDER BY name DESC" + + +@pytest.fixture +def unordered_dataframe(): + # Create a sample DataFrame for testing + data = { + "name": ["John", "Jane", "Alice"], + "age": [25, 30, 35], + "city": ["New York", "London", "Paris"], + } + return pd.DataFrame(data) + + +@pytest.fixture +def test_dataframes(): + df0 = pd.DataFrame({"A": [], "B": []}) + df0_same = pd.DataFrame({"A": [], "B": []}) + df1 = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df1_same = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + df1_value_diff = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) + df1_columns_reordered = pd.DataFrame({"B": [4, 5, 
6], "A": [1, 2, 3]}) + df1_columns_diffcase = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df1_columns_renamed = pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}) + df1_rows_reordered = pd.DataFrame({"A": [2, 1, 3], "B": [5, 4, 6]}) + df1_rows_reordered_columns_renamed = pd.DataFrame({"X": [2, 1, 3], "Y": [5, 4, 6]}) + df1_rows_reordered_more_cols = pd.DataFrame( + {"X": [2, 1, 3], "Y": [5, 4, 6], "Z": [7, 8, 9]} + ) + return ( + df0, + df0_same, + df1, + df1_same, + df1_value_diff, + df1_columns_reordered, + df1_columns_diffcase, + df1_columns_renamed, + df1_rows_reordered, + df1_rows_reordered_columns_renamed, + df1_rows_reordered_more_cols, + ) + + +def test_normalize_table_no_order_by(unordered_dataframe): + # Test normalization without an order by clause + expected_df = pd.DataFrame( + { + "age": [25, 30, 35], + "city": ["New York", "London", "Paris"], + "name": ["John", "Jane", "Alice"], + } + ) + question = "What is the average age of the people in the table?" + normalized_df = normalize_table(unordered_dataframe, query, question) + assert_frame_equal(expected_df, normalized_df) + + +def test_normalize_table_with_order_by(unordered_dataframe): + # Test normalization with an order by clause + expected_df = pd.DataFrame( + { + "age": [25, 30, 35], + "city": ["New York", "London", "Paris"], + "name": ["John", "Jane", "Alice"], + } + ) + question_sort = "What is the average age of the people in the table? sort by name." 
+ normalized_df = normalize_table(unordered_dataframe, query_order_by, question_sort) + + assert_frame_equal(expected_df, normalized_df) + + +def test_compare_df(test_dataframes): + # Assigning the test_dataframes fixture to individual variables + ( + df0, + df0_same, + df1, + df1_same, + df1_value_diff, + df1_columns_reordered, + df1_columns_diffcase, + df1_columns_renamed, + df1_rows_reordered, + df1_rows_reordered_columns_renamed, + df1_rows_reordered_more_cols, + ) = test_dataframes + + question = "What is the average age of the people in the table?" + question_sort = "What is the average age of the people in the table? sort by name." + + # Test case 1: Empty DataFrames, expect True + assert compare_df(df0, df0_same, query, question) == True + + # Test case 2: Identical DataFrames, expect True + assert compare_df(df1, df1_same, query, question) == True + + # Test case 3: Value Difference in a Column, expect False + assert compare_df(df1, df1_value_diff, query, question) == False + + # Test case 4: Reordered Columns, expect True + assert compare_df(df1, df1_columns_reordered, query, question) == True + + # Test case 5: Different Case in Column Names, assume already done so False + assert compare_df(df1, df1_columns_diffcase, query, question) == False + + # Test case 6: Renamed Columns, expect False + assert compare_df(df1, df1_columns_renamed, query, question) == False + + # Test case 7: Reordered Rows, expect True + assert compare_df(df1, df1_rows_reordered, query, question) == True + + # Test case 8: Reordered Rows with specific ordering, expect False + assert compare_df(df1, df1_rows_reordered, query_order_by, question_sort) == False + + # Test case 9: Reordered Rows with specific ordering and renamed columns, expect False + assert ( + compare_df(df1, df1_rows_reordered_columns_renamed, query, question) + ) == False + + # Test case 10: Reordered Rows with specific ordering and renamed and additional columns, expect False + assert (compare_df(df1, 
df1_rows_reordered_more_cols, query, question)) == False + + +def test_subset_df(test_dataframes): + # Assigning the test_dataframes fixture to individual variables + ( + df0, + df0_same, + df1, + df1_same, + df1_value_diff, + df1_columns_reordered, + df1_columns_diffcase, + df1_columns_renamed, + df1_rows_reordered, + df1_rows_reordered_columns_renamed, + df1_rows_reordered_more_cols, + ) = test_dataframes + + question = "What is the average age of the people in the table?" + question_sort = "What is the average age of the people in the table? sort by name." + + # Test case 1: Empty DataFrames + assert subset_df(df0, df0_same, query, question) == True + + # Test case 2: Identical DataFrames + assert subset_df(df1, df1_same, query, question) == True + + # Test case 3: Value Difference in a Column + assert subset_df(df1, df1_value_diff, query, question) == False + + # Test case 4: Reordered Columns + assert subset_df(df1, df1_columns_reordered, query, question) == True + + # Test case 5: Different Case in Column Names + assert subset_df(df1, df1_columns_diffcase, query, question) == True + + # Test case 6: Renamed Columns + assert subset_df(df1, df1_columns_renamed, query, question) == True + + # Test case 7: Reordered Rows + assert subset_df(df1, df1_rows_reordered, query, question) == True + + # Test case 8: Reordered Rows with specific ordering, expect False + assert subset_df(df1, df1_rows_reordered, query_order_by, question_sort) == False + + # Test case 9: Reordered Rows with specific ordering and renamed columns, expect True + assert (subset_df(df1, df1_rows_reordered_columns_renamed, query, question)) == True + + # Test case 10: Reordered Rows with specific ordering and renamed and additional columns, expect True + assert (subset_df(df1, df1_rows_reordered_more_cols, query, question)) == True