From 6531815b2ae49cbde984d5f2b9ebc9ba2ae4f9ba Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 11:44:25 +0100 Subject: [PATCH 01/12] Fix bugs and add requirements checks --- .github/workflows/check_reqs.yaml | 22 +++++++++++++++ df_to_azure/export.py | 6 ++--- df_to_azure/settings.py | 2 ++ df_to_azure/utils.py | 8 ++++-- requirements.txt | 8 ++++++ .../check_setupcfg_and_requirements_equal.py | 26 ++++++++++++++++++ scripts/generate_requirements_from_setup.py | 27 +++++++++++++++++++ 7 files changed, 93 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/check_reqs.yaml create mode 100644 requirements.txt create mode 100644 scripts/check_setupcfg_and_requirements_equal.py create mode 100644 scripts/generate_requirements_from_setup.py diff --git a/.github/workflows/check_reqs.yaml b/.github/workflows/check_reqs.yaml new file mode 100644 index 0000000..46662ed --- /dev/null +++ b/.github/workflows/check_reqs.yaml @@ -0,0 +1,22 @@ +name: Check requirements equal + +on: + push: + branches: + - '*' + - '!main' + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: checkout repo content + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: execute py script # run the run.py to get the latest data + run: | + python ./scripts/check_setupcfg_and_requirementst_equal.py diff --git a/df_to_azure/export.py b/df_to_azure/export.py index 8b28e42..764b3e5 100644 --- a/df_to_azure/export.py +++ b/df_to_azure/export.py @@ -15,7 +15,7 @@ from df_to_azure.adf import ADF from df_to_azure.db import SqlUpsert, auth_azure, execute_stmt from df_to_azure.exceptions import WrongDtypeError -from df_to_azure.utils import test_uniqueness_columns, wait_until_pipeline_is_done +from df_to_azure.utils import test_unique_column_names, wait_until_pipeline_is_done def df_to_azure( @@ -121,8 +121,6 @@ def upload_dataset(self): self.create_schema() self.push_to_azure() if self.method == "upsert": - # key columns have to be unique for upsert. - test_uniqueness_columns(self.df, self.id_field) upsert = SqlUpsert( table_name=self.table_name, schema=self.schema, @@ -286,6 +284,7 @@ def __init__(self, df: pd.DataFrame, tablename: str, folder: str, method: str, i self.upload_name = self.set_upload_name(folder) self.connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING") self._checks() + test_unique_column_names(self.df) def _checks(self): if self.method == "upsert" and not self.id_field: @@ -365,7 +364,6 @@ def run(self): container_client = blob_service_client.get_container_client(container="parquet") if self.method == "upsert": - test_uniqueness_columns(self.df, self.id_field) downloaded_blob = container_client.download_blob(self.upload_name) bytes_io = BytesIO(downloaded_blob.readall()) df_existing = pd.read_parquet(bytes_io) diff --git a/df_to_azure/settings.py b/df_to_azure/settings.py index 5b25e0d..6a65146 100644 --- a/df_to_azure/settings.py +++ b/df_to_azure/settings.py @@ -1,6 +1,7 @@ from typing import Union from pandas import DataFrame +from df_to_azure.utils import test_unique_column_names class TableParameters: @@ -20,6 +21,7 @@ def __init__( # checks self.check_method() self.check_upsert() + test_unique_column_names(df) def check_method(self): valid_methods = ["create", "append", "upsert"] diff --git a/df_to_azure/utils.py b/df_to_azure/utils.py index d0efcd5..75d8c2e 100644 --- a/df_to_azure/utils.py +++ b/df_to_azure/utils.py @@ -84,5 +84,9 @@ def wait_until_pipeline_is_done(adf_client, run_response): raise PipelineRunError("Pipeline is running too long") -def test_uniqueness_columns(df, id_columns): - assert df[id_columns].duplicated().sum() == 0, "When using UPSERT, key columns must be unique." +def test_unique_column_names(df, cols: list = None): + """Column names should be unique""" + if cols: + df = df[cols] + if df.columns.duplicated().sum() != 0: + raise ValueError("Column names are not unique.") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..234e265 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +azure-identity==1.6.0 +azure-mgmt-datafactory==1.1.0 +azure-mgmt-resource==19.0.0 +azure-storage-blob==12.8.1 +pandas==1.3.2 +pyarrow==5.0.0 +pyodbc==4.0.31 +sqlalchemy==1.4.22 \ No newline at end of file diff --git a/scripts/check_setupcfg_and_requirements_equal.py b/scripts/check_setupcfg_and_requirements_equal.py new file mode 100644 index 0000000..2730dd8 --- /dev/null +++ b/scripts/check_setupcfg_and_requirements_equal.py @@ -0,0 +1,26 @@ +import os +import pathlib + +from setuptools.config import read_configuration + + +def get_config(): + repo_path = pathlib.Path(__file__).parent.parent.absolute() + config_setup = read_configuration(os.path.join(repo_path, "setup.cfg")) + config_requirements = config_setup["options"]["install_requires"] + + return config_requirements, repo_path + + +def check(): + config_requirements, repo_path = get_config() + + with open(os.path.join(repo_path, "requirements.txt")) as f: + requirements_txt = f.read().splitlines() + + assert sorted(config_requirements) == sorted(requirements_txt), "Requirements are not equal" + print("Requirements and setup.cfg and both are equal") + + +if __name__ == "__main__": + check() diff --git a/scripts/generate_requirements_from_setup.py b/scripts/generate_requirements_from_setup.py new file mode 100644 index 0000000..f764159 --- /dev/null +++ b/scripts/generate_requirements_from_setup.py @@ -0,0 +1,27 @@ +import os +import pathlib + +from setuptools.config import read_configuration + + +def get_config(): + repo_path = pathlib.Path(__file__).parent.parent.absolute() + config_setup = read_configuration(os.path.join(repo_path, "setup.cfg")) + config_requirements = config_setup["options"]["install_requires"] + + return config_requirements, repo_path + + +def generate_requirements(): + config_requirements, repo_path = get_config() + + with open(os.path.join(repo_path, "requirements.txt"), "w") as f: + f.write("\n".join(config_requirements)) + + print( + "Generated requirements.txt from setup.cfg, with the following requirements\n", "\n".join(config_requirements) + ) + + +if __name__ == "__main__": + generate_requirements() From f8523fcc16cf859f6ad9445132b2728b7cbdc3ab Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 11:47:43 +0100 Subject: [PATCH 02/12] FIx yaml --- .github/workflows/check_reqs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_reqs.yaml b/.github/workflows/check_reqs.yaml index 46662ed..2b1f124 100644 --- a/.github/workflows/check_reqs.yaml +++ b/.github/workflows/check_reqs.yaml @@ -19,4 +19,4 @@ jobs: python-version: 3.8 - name: execute py script # run the run.py to get the latest data run: | - python ./scripts/check_setupcfg_and_requirementst_equal.py + python scripts/check_setupcfg_and_requirementst_equal.py From 850a586a4bd0921d6774af77a271eae72176b666 Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 11:51:09 +0100 Subject: [PATCH 03/12] FIx yaml --- .github/workflows/check_reqs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_reqs.yaml b/.github/workflows/check_reqs.yaml index 2b1f124..5bcfa1f 100644 --- a/.github/workflows/check_reqs.yaml +++ b/.github/workflows/check_reqs.yaml @@ -19,4 +19,4 @@ jobs: python-version: 3.8 - name: execute py script # run the run.py to get the latest data run: | - python scripts/check_setupcfg_and_requirementst_equal.py + python ~/scripts/check_setupcfg_and_requirementst_equal.py From 141ede721fcf8a243a8961898ad8629023fdd8ff Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 11:53:41 +0100 Subject: [PATCH 04/12] FIx yaml --- .github/workflows/check_reqs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/check_reqs.yaml b/.github/workflows/check_reqs.yaml index 5bcfa1f..46662ed 100644 --- a/.github/workflows/check_reqs.yaml +++ b/.github/workflows/check_reqs.yaml @@ -19,4 +19,4 @@ jobs: python-version: 3.8 - name: execute py script # run the run.py to get the latest data run: | - python ~/scripts/check_setupcfg_and_requirementst_equal.py + python ./scripts/check_setupcfg_and_requirementst_equal.py From a55c6cf61a3c8e28f7864cd2800053d2b56ccf40 Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 11:59:56 +0100 Subject: [PATCH 05/12] FIx yaml --- .github/workflows/ci.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fbe2845..18fb3da 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -35,3 +35,12 @@ jobs: - name: Python Black Check run: | black --line-length=120 --check df_to_azure + - name: assert equality between setup.cfg and requirements.txt + uses: actions/checkout@v2 + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + - name: execute py script + run: | + python ./scripts/check_setupcfg_and_requirementst_equal.py From b52a94a57fce74a39e6664edf770616b3c752811 Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 12:01:25 +0100 Subject: [PATCH 06/12] Delete yaml --- .github/workflows/check_reqs.yaml | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 .github/workflows/check_reqs.yaml diff --git a/.github/workflows/check_reqs.yaml b/.github/workflows/check_reqs.yaml deleted file mode 100644 index 46662ed..0000000 --- a/.github/workflows/check_reqs.yaml +++ /dev/null @@ -1,22 +0,0 @@ -name: Check requirements equal - -on: - push: - branches: - - '*' - - '!main' - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - name: checkout repo content - uses: actions/checkout@v2 - - name: setup python - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: execute py script # run the run.py to get the latest data - run: | - python ./scripts/check_setupcfg_and_requirementst_equal.py From 0136a49ff62af9f70f38d90129a970a6eacecf08 Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 13:39:01 +0100 Subject: [PATCH 07/12] Delete yaml --- .github/workflows/ci.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 18fb3da..0fe7168 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -42,5 +42,6 @@ jobs: with: python-version: 3.8 - name: execute py script + uses: actions/checkout@v2 run: | python ./scripts/check_setupcfg_and_requirementst_equal.py From 20607f57823b3186e094bd87eef8ce924058f643 Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 13:40:20 +0100 Subject: [PATCH 08/12] Reverse change --- .github/workflows/ci.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0fe7168..18fb3da 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -42,6 +42,5 @@ jobs: with: python-version: 3.8 - name: execute py script - uses: actions/checkout@v2 run: | python ./scripts/check_setupcfg_and_requirementst_equal.py From 51181899ea2d3f9834df35f21dad741f6829d2e0 Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Mon, 28 Feb 2022 13:46:57 +0100 Subject: [PATCH 09/12] Fix file name --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 18fb3da..b320a6e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -43,4 +43,4 @@ jobs: python-version: 3.8 - name: execute py script run: | - python ./scripts/check_setupcfg_and_requirementst_equal.py + python ./scripts/check_setupcfg_and_requirements_equal.py From 506ad88ba97e2682bbc1473bb1f24c19de0449ea Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Tue, 1 Mar 2022 14:03:34 +0100 Subject: [PATCH 10/12] Add test and fix check uniqueness --- df_to_azure/exceptions.py | 5 +++++ df_to_azure/export.py | 5 ++++- df_to_azure/tests/test_general.py | 13 +++++++++++++ df_to_azure/utils.py | 9 +++++++-- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/df_to_azure/exceptions.py b/df_to_azure/exceptions.py index 1bc1058..a087e90 100644 --- a/df_to_azure/exceptions.py +++ b/df_to_azure/exceptions.py @@ -22,3 +22,8 @@ class WrongDtypeError(Exception): """For the dtypes argument we only accept SQLAlchemy types""" pass + + +class DoubleColumnNamesError(Exception): + """For writing to Azure we do not accept double column names""" + pass diff --git a/df_to_azure/export.py b/df_to_azure/export.py index 764b3e5..6b93c0e 100644 --- a/df_to_azure/export.py +++ b/df_to_azure/export.py @@ -15,7 +15,7 @@ from df_to_azure.adf import ADF from df_to_azure.db import SqlUpsert, auth_azure, execute_stmt from df_to_azure.exceptions import WrongDtypeError -from df_to_azure.utils import test_unique_column_names, wait_until_pipeline_is_done +from df_to_azure.utils import test_unique_column_names, test_uniqueness_column, wait_until_pipeline_is_done def df_to_azure( @@ -121,6 +121,8 @@ def upload_dataset(self): self.create_schema() self.push_to_azure() if self.method == "upsert": + # Key columns need only unqiue values for upsert + test_uniqueness_column(self.df, self.id_field) upsert = SqlUpsert( table_name=self.table_name, schema=self.schema, @@ -364,6 +366,7 @@ def run(self): container_client = blob_service_client.get_container_client(container="parquet") if self.method == "upsert": + test_uniqueness_column(self.df, self.id_field) downloaded_blob = container_client.download_blob(self.upload_name) bytes_io = BytesIO(downloaded_blob.readall()) df_existing = pd.read_parquet(bytes_io) diff --git a/df_to_azure/tests/test_general.py b/df_to_azure/tests/test_general.py index e42bfc3..8e14261 100644 --- a/df_to_azure/tests/test_general.py +++ b/df_to_azure/tests/test_general.py @@ -8,6 +8,7 @@ from df_to_azure import df_to_azure from df_to_azure.db import auth_azure +from df_to_azure.exceptions import DoubleColumnNamesError logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING) secrets_to_environment(keyvault_name="df-to-azure") @@ -181,3 +182,15 @@ def test_convert_bigint(): expected = DataFrame({"COLUMN_NAME": ["A", "B"], "DATA_TYPE": ["bigint", "int"]}) assert_frame_equal(result, expected) + + +def test_double_column_names(): + df_double_names = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30], "C": ["X", "Y", "Z"]}) + df_double_names = df_double_names.rename(columns={"C": "A"}) + with pytest.raises(DoubleColumnNamesError): + df_to_azure( + df=df_double_names, + tablename="double_column_names", + schema="test", + wait_till_finished=True, + ) diff --git a/df_to_azure/utils.py b/df_to_azure/utils.py index 75d8c2e..c36231a 100644 --- a/df_to_azure/utils.py +++ b/df_to_azure/utils.py @@ -2,7 +2,7 @@ import os import time -from df_to_azure.exceptions import PipelineRunError +from df_to_azure.exceptions import PipelineRunError, DoubleColumnNamesError def print_item(group): @@ -84,9 +84,14 @@ def wait_until_pipeline_is_done(adf_client, run_response): raise PipelineRunError("Pipeline is running too long") +def test_uniqueness_column(df, id_columns): + """Test whether values in the id columns are unique""" + assert df[id_columns].duplicated().sum() == 0, "When using UPSERT, key columns must be unique." + + def test_unique_column_names(df, cols: list = None): """Column names should be unique""" if cols: df = df[cols] if df.columns.duplicated().sum() != 0: - raise ValueError("Column names are not unique.") + raise DoubleColumnNamesError("Column names are not unique.") From 456fb5e151bd04648aa9edb361a584ffd3ae5905 Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Tue, 1 Mar 2022 14:06:19 +0100 Subject: [PATCH 11/12] FIx spelling mistakes --- df_to_azure/export.py | 8 ++++---- df_to_azure/utils.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/df_to_azure/export.py b/df_to_azure/export.py index 6b93c0e..6a60ec0 100644 --- a/df_to_azure/export.py +++ b/df_to_azure/export.py @@ -15,7 +15,7 @@ from df_to_azure.adf import ADF from df_to_azure.db import SqlUpsert, auth_azure, execute_stmt from df_to_azure.exceptions import WrongDtypeError -from df_to_azure.utils import test_unique_column_names, test_uniqueness_column, wait_until_pipeline_is_done +from df_to_azure.utils import test_unique_column_names, test_uniqueness_columns, wait_until_pipeline_is_done def df_to_azure( @@ -121,8 +121,8 @@ def upload_dataset(self): self.create_schema() self.push_to_azure() if self.method == "upsert": - # Key columns need only unqiue values for upsert - test_uniqueness_column(self.df, self.id_field) + # Key columns need only unique values for upsert + test_uniqueness_columns(self.df, self.id_field) upsert = SqlUpsert( table_name=self.table_name, schema=self.schema, @@ -366,7 +366,7 @@ def run(self): container_client = blob_service_client.get_container_client(container="parquet") if self.method == "upsert": - test_uniqueness_column(self.df, self.id_field) + test_uniqueness_columns(self.df, self.id_field) downloaded_blob = container_client.download_blob(self.upload_name) bytes_io = BytesIO(downloaded_blob.readall()) df_existing = pd.read_parquet(bytes_io) diff --git a/df_to_azure/utils.py b/df_to_azure/utils.py index c36231a..c587b30 100644 --- a/df_to_azure/utils.py +++ b/df_to_azure/utils.py @@ -84,7 +84,7 @@ def wait_until_pipeline_is_done(adf_client, run_response): raise PipelineRunError("Pipeline is running too long") -def test_uniqueness_column(df, id_columns): +def test_uniqueness_columns(df, id_columns): """Test whether values in the id columns are unique""" assert df[id_columns].duplicated().sum() == 0, "When using UPSERT, key columns must be unique." From 0879a4974ff3910906401c283880ec2d8f275a96 Mon Sep 17 00:00:00 2001 From: Tim van der Heijden Date: Tue, 1 Mar 2022 14:07:51 +0100 Subject: [PATCH 12/12] Fix black --- df_to_azure/exceptions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/df_to_azure/exceptions.py b/df_to_azure/exceptions.py index a087e90..d0a23d3 100644 --- a/df_to_azure/exceptions.py +++ b/df_to_azure/exceptions.py @@ -26,4 +26,5 @@ class WrongDtypeError(Exception): class DoubleColumnNamesError(Exception): """For writing to Azure we do not accept double column names""" + pass