Skip to content

Commit

Permalink
Merge pull request #124 from zypp-io/development
Browse files Browse the repository at this point in the history
Release to main
  • Loading branch information
erfannariman authored Mar 24, 2023
2 parents 8bae1b0 + 2e6b6cc commit 3523ee6
Show file tree
Hide file tree
Showing 17 changed files with 81 additions and 56 deletions.
13 changes: 3 additions & 10 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,10 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest black
- name: Lint with flake8
pip install -r requirements-dev.txt
- name: Run pre-commit
run: |
# exit-zero treats all errors as warnings.
flake8 df_to_azure --count --ignore=E722 --max-complexity=10 --max-line-length=120 --statistics --per-file-ignores="__init__.py:F401"
# - name: Test with pytest
# run: |
# pytest df_to_azure
- name: Python Black Check
run: |
black --line-length=120 --check df_to_azure
pre-commit run --all-files
- name: assert equality between setup.cfg and requirements.txt
uses: actions/checkout@v2
- name: setup python
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,4 +143,4 @@ notebooks/
settings.yml

# mac
.DS_Store
.DS_Store
21 changes: 18 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- id: check-merge-conflict
- id: debug-statements
- id: detect-private-key
- id: name-tests-test
args: [--pytest-test-first]
- id: requirements-txt-fixer
- repo: https://github.com/pycqa/flake8
rev: 3.9.2
rev: 5.0.4
hooks:
- id: flake8
args: ["--statistics", "--count", "--max-complexity=10", "--max-line-length=120", "--per-file-ignores=__init__.py: F401"]
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
args: [--line-length=120]
- repo: https://github.com/PyCQA/isort
rev: 5.9.1
rev: 5.12.0
hooks:
- id: isort
args: ["--profile", "black", "--line-length=120"]
- repo: local
hooks:
- id: check-requirements
name: Check requirements
description: Check if requirements in setup.cfg and requirements.txt are equal
language: python
entry: python scripts/check_setupcfg_and_requirements_equal.py
pass_filenames: false
2 changes: 1 addition & 1 deletion data/employee_1.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ employee_id,week_nr,hours
333,30,10
444,15,2
444,16,4
444,20,10
444,20,10
2 changes: 1 addition & 1 deletion data/employee_2.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ employee_id,week_nr,hours
333,30,10
444,15,99
444,16,4
444,20,99
444,20,99
2 changes: 1 addition & 1 deletion data/employee_duplicate_keys_1.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ employee_id,week_nr,hours
333,30,10
444,15,2
444,15,4
444,20,10
444,20,10
2 changes: 1 addition & 1 deletion data/employee_duplicate_keys_2.csv
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ employee_id,week_nr,hours
333,30,10
444,15,99
444,15,4
444,20,99
444,20,99
2 changes: 1 addition & 1 deletion data/sample_1.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
col_a,col_b,col_c
1,test,X
3,test,Z
4,test,A
4,test,A
2 changes: 1 addition & 1 deletion data/sample_2.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ col_a,col_b,col_c
1,updated value,E
3,test,Z
5,new value,F
6,also new,H
6,also new,H
2 changes: 1 addition & 1 deletion df_to_azure/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from .export import df_to_azure

__version__ = "0.8.0"
__version__ = "0.9.0"

logging.basicConfig(
format="%(asctime)s.%(msecs)03d [%(levelname)-5s] [%(name)s] - %(message)s",
Expand Down
4 changes: 2 additions & 2 deletions df_to_azure/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,14 @@ def create_stored_procedure(self):
)


def auth_azure():
def auth_azure(driver: str = "ODBC Driver 17 for SQL Server"):

connection_string = "mssql+pyodbc://{}:{}@{}:1433/{}?driver={}".format(
os.environ.get("SQL_USER"),
quote_plus(os.environ.get("SQL_PW")),
os.environ.get("SQL_SERVER"),
os.environ.get("SQL_DB"),
"ODBC Driver 17 for SQL Server",
driver,
)
con = create_engine(connection_string).connect()

Expand Down
28 changes: 21 additions & 7 deletions df_to_azure/export.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import os
import sys
from datetime import datetime
from io import BytesIO
from typing import Union
Expand All @@ -9,7 +8,18 @@
import pandas as pd
from azure.storage.blob import BlobServiceClient
from numpy import dtype
from pandas import BooleanDtype, DataFrame, Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype, StringDtype
from pandas import (
BooleanDtype,
CategoricalDtype,
DataFrame,
DatetimeTZDtype,
Float64Dtype,
Int8Dtype,
Int16Dtype,
Int32Dtype,
Int64Dtype,
StringDtype,
)
from sqlalchemy.sql.visitors import VisitableType
from sqlalchemy.types import BigInteger, Boolean, DateTime, Integer, Numeric, String

Expand Down Expand Up @@ -97,6 +107,11 @@ def __init__(
self.clean_staging = clean_staging

def run(self):

if self.df.empty:
logging.info("Data empty, no new records to upload.")
return None, None

if self.create:

# azure components
Expand Down Expand Up @@ -134,10 +149,6 @@ def _checks(self):

def upload_dataset(self):

if self.df.empty:
logging.info("Data empty, no new records to upload.")
sys.exit(1)

if self.method == "create":
self.create_schema()
self.push_to_azure()
Expand Down Expand Up @@ -186,7 +197,7 @@ def upload_to_blob(self):
blob=f"{self.table_name}/{self.table_name}.csv",
)

data = self.df.to_csv(index=False, sep="^", quotechar='"', line_terminator="\n")
data = self.df.to_csv(index=False, sep="^", quotechar='"', lineterminator="\n")
blob_client.upload_blob(data, overwrite=True)

def create_schema(self):
Expand Down Expand Up @@ -234,12 +245,15 @@ def column_types(self) -> dict:
Int16Dtype(): Integer(),
Int32Dtype(): Integer(),
Int64Dtype(): Integer(),
Float64Dtype(): numeric,
dtype("float64"): numeric,
dtype("float32"): numeric,
dtype("float16"): numeric,
dtype("<M8[ns]"): DateTime(),
dtype("bool"): Boolean(),
BooleanDtype(): Boolean(),
DatetimeTZDtype(tz="utc"): DateTime(),
CategoricalDtype(): string,
}

col_types = {col_name: type_conversion[col_type] for col_name, col_type in self.df.dtypes.to_dict().items()}
Expand Down
1 change: 1 addition & 0 deletions df_to_azure/settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Union

from pandas import DataFrame

from df_to_azure.utils import test_unique_column_names


Expand Down
17 changes: 9 additions & 8 deletions df_to_azure/tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,15 @@ def test_pipeline_name():
def test_empty_dataframe():
df = DataFrame()

with pytest.raises(SystemExit):
df_to_azure(
df=df,
tablename="empty_dataframe",
schema="test",
method="create",
wait_till_finished=True,
)
adf_client, run_response = df_to_azure(
df=df,
tablename="empty_dataframe",
schema="test",
method="create",
wait_till_finished=True,
)
assert adf_client is None
assert run_response is None


def test_convert_bigint():
Expand Down
2 changes: 1 addition & 1 deletion df_to_azure/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import time

from df_to_azure.exceptions import PipelineRunError, DoubleColumnNamesError
from df_to_azure.exceptions import DoubleColumnNamesError, PipelineRunError


def print_item(group):
Expand Down
16 changes: 8 additions & 8 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
azure-identity~=1.7.1
azure-mgmt-datafactory~=2.2.0
azure-mgmt-resource~=20.1.0
azure-storage-blob~=12.8.1
pandas~=1.4.1
pyarrow~=7.0.0
pyodbc~=4.0.32
sqlalchemy~=1.4.31
azure-identity>=1.7.1
azure-mgmt-datafactory>=2.2.0,<2.7.0
azure-mgmt-resource>=20.1.0
azure-storage-blob>=12.8.1
pandas>=1.5.0
pyarrow>=7.0.0
pyodbc>=4.0.32
sqlalchemy>=1.4.31,<2.0.0
19 changes: 10 additions & 9 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = df_to_azure
version = 0.8.0
version = 0.9.0
author = Melvin Folkers, Erfan Nariman
author_email = melvin@zypp.io, erfan@zypp.io
description = Automatically write pandas DataFrames to SQL by creating pipelines in Azure Data Factory with copy activity from blob to SQL
Expand All @@ -20,14 +20,15 @@ classifiers =
packages = df_to_azure
python_requires = >=3.7
install_requires =
azure-identity~=1.7.1
azure-mgmt-datafactory~=2.2.0
azure-mgmt-resource~=20.1.0
azure-storage-blob~=12.8.1
pandas~=1.4.1
pyarrow~=7.0.0
pyodbc~=4.0.32
sqlalchemy~=1.4.31
azure-identity>=1.7.1
azure-mgmt-datafactory>=2.2.0,<2.7.0
azure-mgmt-resource>=20.1.0
azure-storage-blob>=12.8.1
pandas>=1.5.0
pyarrow>=7.0.0
pyodbc>=4.0.32
sqlalchemy>=1.4.31,<2.0.0



[flake8]
Expand Down

0 comments on commit 3523ee6

Please sign in to comment.