Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bugs and add requirements checks #109

Merged
merged 12 commits into from
Mar 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,12 @@ jobs:
- name: Python Black Check
run: |
black --line-length=120 --check df_to_azure
# Step names were mismatched: the step labeled "assert equality between
# setup.cfg and requirements.txt" actually ran actions/checkout, and the real
# check was vaguely named "execute py script". Names now describe what each
# step does.
- name: checkout repository
  uses: actions/checkout@v2
- name: setup python
  uses: actions/setup-python@v2
  with:
    python-version: 3.8
- name: assert equality between setup.cfg and requirements.txt
  run: |
    python ./scripts/check_setupcfg_and_requirements_equal.py
6 changes: 6 additions & 0 deletions df_to_azure/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,9 @@ class WrongDtypeError(Exception):
"""For the dtypes argument we only accept SQLAlchemy types"""

pass


class DoubleColumnNamesError(Exception):
    """Raised when a DataFrame has duplicate column names, which is not accepted when writing to Azure."""
5 changes: 3 additions & 2 deletions df_to_azure/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from df_to_azure.adf import ADF
from df_to_azure.db import SqlUpsert, auth_azure, execute_stmt
from df_to_azure.exceptions import WrongDtypeError
from df_to_azure.utils import test_uniqueness_columns, wait_until_pipeline_is_done
from df_to_azure.utils import test_unique_column_names, test_uniqueness_columns, wait_until_pipeline_is_done


def df_to_azure(
Expand Down Expand Up @@ -121,7 +121,7 @@ def upload_dataset(self):
self.create_schema()
self.push_to_azure()
if self.method == "upsert":
# key columns have to be unique for upsert.
# Key columns need only unique values for upsert
test_uniqueness_columns(self.df, self.id_field)
upsert = SqlUpsert(
table_name=self.table_name,
Expand Down Expand Up @@ -286,6 +286,7 @@ def __init__(self, df: pd.DataFrame, tablename: str, folder: str, method: str, i
self.upload_name = self.set_upload_name(folder)
self.connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
self._checks()
test_unique_column_names(self.df)

def _checks(self):
if self.method == "upsert" and not self.id_field:
Expand Down
2 changes: 2 additions & 0 deletions df_to_azure/settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Union

from pandas import DataFrame
from df_to_azure.utils import test_unique_column_names


class TableParameters:
Expand All @@ -20,6 +21,7 @@ def __init__(
# checks
self.check_method()
self.check_upsert()
test_unique_column_names(df)

def check_method(self):
valid_methods = ["create", "append", "upsert"]
Expand Down
13 changes: 13 additions & 0 deletions df_to_azure/tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from df_to_azure import df_to_azure
from df_to_azure.db import auth_azure
from df_to_azure.exceptions import DoubleColumnNamesError

logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
secrets_to_environment(keyvault_name="df-to-azure")
Expand Down Expand Up @@ -181,3 +182,15 @@ def test_convert_bigint():

expected = DataFrame({"COLUMN_NAME": ["A", "B"], "DATA_TYPE": ["bigint", "int"]})
assert_frame_equal(result, expected)


def test_double_column_names():
    """Writing a DataFrame with duplicate column names must raise DoubleColumnNamesError."""
    data = {"A": [1, 2, 3], "B": [10, 20, 30], "C": ["X", "Y", "Z"]}
    # Renaming "C" to "A" yields the duplicate column labels A, B, A.
    df_dup = DataFrame(data).rename(columns={"C": "A"})
    with pytest.raises(DoubleColumnNamesError):
        df_to_azure(
            df=df_dup,
            tablename="double_column_names",
            schema="test",
            wait_till_finished=True,
        )
11 changes: 10 additions & 1 deletion df_to_azure/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import time

from df_to_azure.exceptions import PipelineRunError
from df_to_azure.exceptions import PipelineRunError, DoubleColumnNamesError


def print_item(group):
Expand Down Expand Up @@ -85,4 +85,13 @@ def wait_until_pipeline_is_done(adf_client, run_response):


def test_uniqueness_columns(df, id_columns):
"""Test whether values in the id columns are unique"""
assert df[id_columns].duplicated().sum() == 0, "When using UPSERT, key columns must be unique."


def test_unique_column_names(df, cols: list = None):
"""Column names should be unique"""
if cols:
df = df[cols]
if df.columns.duplicated().sum() != 0:
raise DoubleColumnNamesError("Column names are not unique.")
8 changes: 8 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
azure-identity==1.6.0
azure-mgmt-datafactory==1.1.0
azure-mgmt-resource==19.0.0
azure-storage-blob==12.8.1
pandas==1.3.2
pyarrow==5.0.0
pyodbc==4.0.31
sqlalchemy==1.4.22
26 changes: 26 additions & 0 deletions scripts/check_setupcfg_and_requirements_equal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import os
import pathlib

from setuptools.config import read_configuration


def get_config():
    """Read the ``install_requires`` list from the repository's setup.cfg.

    Returns:
        Tuple of (list of requirement strings, repository root path).
    """
    repo_path = pathlib.Path(__file__).absolute().parent.parent
    parsed = read_configuration(str(repo_path / "setup.cfg"))
    return parsed["options"]["install_requires"], repo_path


def check():
    """Verify that requirements.txt matches setup.cfg's ``install_requires``.

    Order is ignored; the two lists must contain exactly the same entries.

    Raises:
        AssertionError: if the two requirement lists differ.
    """
    config_requirements, repo_path = get_config()

    with open(os.path.join(repo_path, "requirements.txt")) as f:
        requirements_txt = f.read().splitlines()

    # Explicit raise instead of a bare ``assert`` so the CI check is not
    # stripped when Python runs with optimizations (-O).
    if sorted(config_requirements) != sorted(requirements_txt):
        raise AssertionError("Requirements are not equal")
    # Original message was garbled ("Requirements and setup.cfg and both are equal").
    print("requirements.txt and setup.cfg install_requires are equal")


if __name__ == "__main__":
    check()
27 changes: 27 additions & 0 deletions scripts/generate_requirements_from_setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import os
import pathlib

from setuptools.config import read_configuration


def get_config():
    """Read the ``install_requires`` list from the repository's setup.cfg.

    Returns:
        Tuple of (list of requirement strings, repository root path).
    """
    repo_path = pathlib.Path(__file__).absolute().parent.parent
    parsed = read_configuration(str(repo_path / "setup.cfg"))
    return parsed["options"]["install_requires"], repo_path


def generate_requirements():
    """Write requirements.txt at the repository root from setup.cfg's ``install_requires``."""
    config_requirements, repo_path = get_config()

    with open(os.path.join(repo_path, "requirements.txt"), "w") as f:
        # End the file with a newline so the generated file is a valid POSIX
        # text file and diffs cleanly; the equality check uses splitlines(),
        # which is unaffected by the trailing newline.
        f.write("\n".join(config_requirements) + "\n")

    print(
        "Generated requirements.txt from setup.cfg, with the following requirements\n", "\n".join(config_requirements)
    )


if __name__ == "__main__":
    generate_requirements()