From 589fbd7f5c3cf07a8e35b436b5f361a4b3b19f02 Mon Sep 17 00:00:00 2001
From: jp <wongjingping@gmail.com>
Date: Thu, 10 Aug 2023 15:54:06 +0800
Subject: [PATCH] added eval functions, tests, CI, requirements.txt and a guide
 for contributions

---
 .github/workflows/main.yml |  26 ++++++
 .gitignore                 |   6 +-
 CONTRIBUTING.md            |  40 +++++++++
 README.md                  |   6 +-
 eval/eval.py               | 140 +++++++++++++++++++++++++++++
 requirements.txt           |   4 +
 tests/__init__.py          |   0
 tests/test_eval.py         | 179 +++++++++++++++++++++++++++++++++++++
 8 files changed, 399 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/main.yml
 create mode 100644 CONTRIBUTING.md
 create mode 100644 eval/eval.py
 create mode 100644 requirements.txt
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_eval.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..819727b
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,26 @@
+name: tests
+
+on: [push, pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: psf/black@stable
+  test:
+    runs-on: ubuntu-latest
+    needs: lint
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.9'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+      - name: Run tests
+        run: |
+          pytest tests
diff --git a/.gitignore b/.gitignore
index 6e7377c..09e6ac3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,5 @@
-data/postgres
\ No newline at end of file
+data/postgres
+
+# pycache
+**/__pycache__/
+.pytest_cache
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..cbb3fc4
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,40 @@
+# Contributing Guidelines
+
+Thank you for considering contributing to our project! We value your contributions and want to ensure a smooth and collaborative experience for everyone. Please take a moment to review the following guidelines.
+
+## Table of Contents
+- [Linting](#linting)
+- [Testing](#testing)
+- [Submitting Changes](#submitting-changes)
+
+## Linting
+
+We use [black](https://black.readthedocs.io/en/stable/) for code formatting and linting. After installing it via pip, you can automatically lint your code with black by adding it as a pre-commit git hook:
+```bash
+pip install black
+echo -e '#!/bin/sh\n#\n# Run linter before commit\nblack $(git rev-parse --show-toplevel)' > .git/hooks/pre-commit && chmod +x .git/hooks/pre-commit
+```
+
+## Testing
+
+[_Quis probabit ipsa probationem?_](https://en.wikipedia.org/wiki/Quis_custodiet_ipsos_custodes%3F)
+
+We have a comprehensive test suite that ensures the quality and reliability of our codebase. To run the tests, you can use the following command:
+
+```bash
+pytest tests
+```
+
+Please make sure that all tests pass before submitting your changes.
+
+## Submitting Changes
+
+When submitting changes to this repository, please follow these steps:
+
+- Fork the repository and create a new branch for your changes.
+- Make your changes, following the coding style and best practices outlined here.
+- Run the tests to ensure your changes don't introduce any regressions.
+- Lint your code and [squash your commits](https://www.git-tower.com/learn/git/faq/git-squash) down to a single commit.
+- Commit your changes and push them to your forked repository.
+- Open a pull request to the main repository and provide a detailed description of your changes.
+- Your pull request will be reviewed by our team, and we may ask for further improvements or clarifications before merging. Thank you for your contribution!
\ No newline at end of file
diff --git a/README.md b/README.md
index 0cfd53d..2c511d2 100644
--- a/README.md
+++ b/README.md
@@ -48,4 +48,8 @@ The data for importing is already in the exported sql dumps in the `data/export`
 
 ```bash
 ./data/init_db.sh
-```
\ No newline at end of file
+```
+
+## Misc
+
+We welcome contributions to our project. Please see [CONTRIBUTING.md](https://github.com/defog-ai/sql-generation-evaluation/blob/main/CONTRIBUTING.md) for more information.
\ No newline at end of file
diff --git a/eval/eval.py b/eval/eval.py
new file mode 100644
index 0000000..b9cfc45
--- /dev/null
+++ b/eval/eval.py
@@ -0,0 +1,140 @@
+# this file contains all of the helper functions used for evaluations
+
+import re
+from func_timeout import func_timeout
+import pandas as pd
+from pandas.testing import assert_frame_equal, assert_series_equal
+from sqlalchemy import create_engine
+
+# like_pattern = r"LIKE\s+'[^']*'"
+like_pattern = r"LIKE[\s\S]*'"
+
+
+def normalize_table(
+    df: pd.DataFrame, query_category: str, question: str
+) -> pd.DataFrame:
+    """
+    Puts a dataframe into a canonical form so that two results can be compared.
+
+    Columns are reordered alphabetically. Rows are sorted on every column,
+    left to right, unless row order matters, i.e. query_category is
+    'order_by' or the question mentions order/sort/arrange. The index is
+    always reset.
+    """
+    # alphabetical column order
+    column_names = sorted(df.columns)
+    result = df.reindex(column_names, axis=1)
+
+    # row order matters for 'order_by' queries and questions that ask for it
+    ordering_words = re.compile(r"(order|sort|arrange)", re.IGNORECASE)
+    asks_for_order = re.search(ordering_words, question.lower())
+    keep_row_order = query_category == "order_by" or asks_for_order
+    if not keep_row_order:
+        # order is irrelevant here, so sort on all columns for a stable layout
+        result = result.sort_values(by=column_names)
+    # discard the old index labels
+    return result.reset_index(drop=True)
+
+
+# re.sub callback: doubles every percent sign inside the matched text
+def escape_percent(match):
+    """
+    Returns the text matched by `match` with each '%' replaced by '%%'.
+    """
+    matched_text = match.group(0)
+    # '%' -> '%%'
+    return matched_text.replace("%", "%%")
+
+
+def query_postgres_db(
+    query: str, db_name: str, db_creds: dict, timeout: float
+) -> pd.DataFrame:
+    """
+    Runs query on postgres db and returns results as a dataframe.
+    This assumes that you have the evaluation database running locally.
+    If you don't, you can follow the instructions in the README (Restoring to Postgres) to set it up.
+
+    timeout: time in seconds to wait for query to finish before timing out
+    Any error is propagated to the caller; the engine is disposed either way.
+    """
+    engine = None  # bound before the try so cleanup never hits an unbound name
+    try:
+        db_url = f"postgresql://{db_creds['user']}:{db_creds['password']}@{db_creds['host']}:{db_creds['port']}/{db_name}"
+        engine = create_engine(db_url)
+        escaped_query = re.sub(
+            like_pattern, escape_percent, query, flags=re.IGNORECASE
+        )  # ignore case of LIKE
+        results_df = func_timeout(
+            timeout, pd.read_sql_query, args=(escaped_query, engine)
+        )
+        return results_df
+    finally:
+        if engine is not None:
+            engine.dispose()  # runs on success, query failure and timeout alike
+
+
+def compare_df(
+    df1: pd.DataFrame, df2: pd.DataFrame, query_category: str, question: str
+) -> bool:
+    """Returns True if both dataframes are equal once normalized, else False."""
+    left = normalize_table(df1, query_category, question)
+    right = normalize_table(df2, query_category, question)
+    try:
+        # check_dtype=False tolerates dtype mismatches between the two frames
+        assert_frame_equal(left, right, check_dtype=False)
+    except AssertionError:
+        return False
+    else:
+        return True
+
+
+def subset_df(
+    df_sub: pd.DataFrame,
+    df_super: pd.DataFrame,
+    query_category: str,
+    question: str,
+    verbose: bool = False,
+) -> bool:
+    """
+    Checks if df_sub is a subset of df_super.
+
+    Each df_sub column must equal (sorted values, names ignored) a distinct
+    df_super column, and the matched columns renamed to df_sub's names must
+    equal df_sub after normalization. An empty df_sub is trivially a subset.
+    verbose: print the name of the first df_sub column that has no match
+    """
+    if df_sub.empty:
+        return True  # trivial case
+    # df_super columns still available for matching; each is used at most once
+    unmatched = df_super.copy(deep=True)
+    matched_columns = []
+    for col_sub_name in df_sub.columns:
+        # sorted once per df_sub column rather than once per candidate column
+        col_sub = df_sub[col_sub_name].sort_values().reset_index(drop=True)
+        for col_super_name in unmatched.columns:
+            col_super = unmatched[col_super_name].sort_values().reset_index(drop=True)
+            try:
+                assert_series_equal(
+                    col_sub, col_super, check_dtype=False, check_names=False
+                )
+            except AssertionError:
+                continue
+            matched_columns.append(col_super_name)
+            # drop the matched column so it cannot be matched again
+            unmatched = unmatched.drop(columns=[col_super_name])
+            break
+        else:  # no break: nothing in df_super matched this column
+            if verbose:
+                print(f"no match for {col_sub_name}")
+            return False
+    df_sub_normalized = normalize_table(df_sub, query_category, question)
+    # get matched columns from df_super, and rename them with columns from df_sub, then normalize
+    df_super_matched = df_super[matched_columns].rename(
+        columns=dict(zip(matched_columns, df_sub.columns))
+    )
+    df_super_matched = normalize_table(df_super_matched, query_category, question)
+    try:
+        assert_frame_equal(df_sub_normalized, df_super_matched, check_dtype=False)
+        return True
+    except AssertionError:
+        return False
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6598cf8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+func_timeout
+pandas
+pytest
+sqlalchemy
\ No newline at end of file
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_eval.py b/tests/test_eval.py
new file mode 100644
index 0000000..05e0c12
--- /dev/null
+++ b/tests/test_eval.py
@@ -0,0 +1,179 @@
+import pandas as pd
+from pandas.testing import assert_frame_equal
+import pytest
+from eval.eval import normalize_table, compare_df, subset_df
+
+query = "SELECT * FROM table_name"
+query_order_by = "SELECT * FROM table_name ORDER BY name DESC"
+
+
+@pytest.fixture
+def unordered_dataframe():
+    # three-row sample frame; columns deliberately not in alphabetical order
+    columns = {
+        "name": ["John", "Jane", "Alice"],
+        "age": [25, 30, 35],
+        "city": ["New York", "London", "Paris"],
+    }
+    return pd.DataFrame(columns)
+
+
+@pytest.fixture
+def test_dataframes():
+    """
+    Dataframes shared by the comparison tests.
+
+    Returned as a tuple in the order the tests unpack it: an empty pair,
+    the base frame and an identical copy, then variants of the base frame
+    (changed value, reordered/recased/renamed columns, reordered rows).
+    """
+    empty = {"A": [], "B": []}
+    base = {"A": [1, 2, 3], "B": [4, 5, 6]}
+    shuffled_xy = {"X": [2, 1, 3], "Y": [5, 4, 6]}
+    # the order below must match the tuple unpacking done in the tests
+    return (
+        pd.DataFrame(empty),  # df0
+        pd.DataFrame(empty),  # df0_same
+        pd.DataFrame(base),  # df1
+        pd.DataFrame(base),  # df1_same
+        pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}),  # df1_value_diff
+        pd.DataFrame({"B": [4, 5, 6], "A": [1, 2, 3]}),  # df1_columns_reordered
+        pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}),  # df1_columns_diffcase
+        pd.DataFrame({"C": [1, 2, 3], "D": [4, 5, 6]}),  # df1_columns_renamed
+        pd.DataFrame({"A": [2, 1, 3], "B": [5, 4, 6]}),  # df1_rows_reordered
+        # df1_rows_reordered_columns_renamed
+        pd.DataFrame(shuffled_xy),
+        # df1_rows_reordered_more_cols
+        pd.DataFrame({**shuffled_xy, "Z": [7, 8, 9]}),
+    )
+
+
+def test_normalize_table_no_order_by(unordered_dataframe):
+    # No ordering is requested: columns become alphabetical and rows are sorted
+    # by value, which leaves this fixture's rows in their original positions
+    question = "What is the average age of the people in the table?"
+    actual = normalize_table(unordered_dataframe, query, question)
+    expected_columns = {
+        "age": [25, 30, 35],
+        "city": ["New York", "London", "Paris"],
+        "name": ["John", "Jane", "Alice"],
+    }
+    expected = pd.DataFrame(expected_columns)
+    assert_frame_equal(expected, actual)
+
+
+def test_normalize_table_with_order_by(unordered_dataframe):
+    # The question asks for sorting, so rows must keep their incoming order
+    # while the columns are still rearranged alphabetically
+    question_sort = "What is the average age of the people in the table? sort by name."
+    actual = normalize_table(unordered_dataframe, query_order_by, question_sort)
+
+    expected_columns = {
+        "age": [25, 30, 35],
+        "city": ["New York", "London", "Paris"],
+        "name": ["John", "Jane", "Alice"],
+    }
+    expected = pd.DataFrame(expected_columns)
+    assert_frame_equal(expected, actual)
+
+
+def test_compare_df(test_dataframes):
+    # Unpack the fixture tuple; the order must match the test_dataframes fixture
+    (
+        df0,
+        df0_same,
+        df1,
+        df1_same,
+        df1_value_diff,
+        df1_columns_reordered,
+        df1_columns_diffcase,
+        df1_columns_renamed,
+        df1_rows_reordered,
+        df1_rows_reordered_columns_renamed,
+        df1_rows_reordered_more_cols,
+    ) = test_dataframes
+
+    question = "What is the average age of the people in the table?"
+    question_sort = "What is the average age of the people in the table? sort by name."
+
+    # Test case 1: Empty DataFrames, expect True
+    assert compare_df(df0, df0_same, query, question)
+
+    # Test case 2: Identical DataFrames, expect True
+    assert compare_df(df1, df1_same, query, question)
+
+    # Test case 3: Value Difference in a Column, expect False
+    assert not compare_df(df1, df1_value_diff, query, question)
+
+    # Test case 4: Reordered Columns, expect True
+    assert compare_df(df1, df1_columns_reordered, query, question)
+
+    # Test case 5: Column names differing only in case are not matched, expect False
+    assert not compare_df(df1, df1_columns_diffcase, query, question)
+
+    # Test case 6: Renamed Columns, expect False
+    assert not compare_df(df1, df1_columns_renamed, query, question)
+
+    # Test case 7: Reordered Rows, expect True
+    assert compare_df(df1, df1_rows_reordered, query, question)
+
+    # Test case 8: Reordered Rows when the question asks for sorting, expect False
+    assert not compare_df(df1, df1_rows_reordered, query_order_by, question_sort)
+
+    # Test case 9: Reordered Rows (no ordering requested) and renamed columns,
+    # expect False because the column names no longer match
+    assert not compare_df(df1, df1_rows_reordered_columns_renamed, query, question)
+
+    # Test case 10: Reordered Rows (no ordering requested) with renamed and
+    # additional columns, expect False
+    assert not compare_df(df1, df1_rows_reordered_more_cols, query, question)
+
+
+def test_subset_df(test_dataframes):
+    # Unpack the fixture tuple; the order must match the test_dataframes fixture
+    (
+        df0,
+        df0_same,
+        df1,
+        df1_same,
+        df1_value_diff,
+        df1_columns_reordered,
+        df1_columns_diffcase,
+        df1_columns_renamed,
+        df1_rows_reordered,
+        df1_rows_reordered_columns_renamed,
+        df1_rows_reordered_more_cols,
+    ) = test_dataframes
+
+    question = "What is the average age of the people in the table?"
+    question_sort = "What is the average age of the people in the table? sort by name."
+
+    # Test case 1: Empty DataFrames, expect True
+    assert subset_df(df0, df0_same, query, question)
+
+    # Test case 2: Identical DataFrames, expect True
+    assert subset_df(df1, df1_same, query, question)
+
+    # Test case 3: Value Difference in a Column, expect False
+    assert not subset_df(df1, df1_value_diff, query, question)
+
+    # Test case 4: Reordered Columns, expect True
+    assert subset_df(df1, df1_columns_reordered, query, question)
+
+    # Test case 5: Different Case in Column Names (names are ignored), expect True
+    assert subset_df(df1, df1_columns_diffcase, query, question)
+
+    # Test case 6: Renamed Columns (names are ignored), expect True
+    assert subset_df(df1, df1_columns_renamed, query, question)
+
+    # Test case 7: Reordered Rows, expect True
+    assert subset_df(df1, df1_rows_reordered, query, question)
+
+    # Test case 8: Reordered Rows when the question asks for sorting, expect False
+    assert not subset_df(df1, df1_rows_reordered, query_order_by, question_sort)
+
+    # Test case 9: Reordered Rows (no ordering requested) and renamed columns, expect True
+    assert subset_df(df1, df1_rows_reordered_columns_renamed, query, question)
+
+    # Test case 10: Reordered Rows (no ordering requested), renamed and additional columns, expect True
+    assert subset_df(df1, df1_rows_reordered_more_cols, query, question)