From a08bbcc69ce6ba8cd49e6748aad55021b7a7d525 Mon Sep 17 00:00:00 2001 From: Sam Debruyn Date: Mon, 21 Oct 2024 15:35:10 +0200 Subject: [PATCH] Add support for Azure SQL, Synapse, and Microsoft Fabric and extend support for SQL Server (#2160) * working fabric data source inheriting from sqlserver * fix failing tests * fix table creation in fabric * restore dev-reqs * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add email check for sqlserver and fabric * add test for email format * remove useless line * remove useless line * remove extra deps * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * enable auth with mssparkutils * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add fabric spark auth * Update tbump+version --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Milan Lukac Co-authored-by: Milan Lukac --- .env.example | 3 + dev-requirements.txt | 2 +- pytest.ini | 1 + requirements.txt | 1 + .../test_bug_double_metric_computation.py | 2 +- ...a_source_specific_aggregation_functions.py | 2 +- .../data_source/test_distribution_check.py | 4 +- soda/core/tests/data_source/test_formats.py | 8 +- soda/core/tests/data_source/test_freshness.py | 8 +- soda/core/tests/data_source/test_invalid.py | 4 +- .../data_source/test_metric_check_filter.py | 4 +- ...numerical_metric_checks_on_text_columns.py | 4 +- .../data_source/test_percentage_metrics.py | 2 +- .../tests/data_source/test_table_filter.py | 4 +- .../test_user_defined_metric_checks.py | 2 +- soda/fabric/LICENSE | 201 ++++++++++++++++++ soda/fabric/setup.py | 16 ++ .../soda/data_sources/fabric_data_source.py | 43 ++++ .../tests/fabric_data_source_fixture.py | 20 ++ soda/fabric/tests/test_fabric.py | 2 + soda/sqlserver/setup.py | 5 +- .../data_sources/sqlserver_data_source.py | 188 +++++++++++++--- tbump.toml | 4 + 23 files changed, 480 insertions(+), 50 deletions(-) create mode 100644 soda/fabric/LICENSE create mode 100644 soda/fabric/setup.py create mode 100644 soda/fabric/soda/data_sources/fabric_data_source.py create mode 100644 soda/fabric/tests/fabric_data_source_fixture.py create mode 100644 soda/fabric/tests/test_fabric.py diff --git a/.env.example b/.env.example index 7853168ba..1434fee6a 100644 --- a/.env.example +++ b/.env.example @@ -51,3 +51,6 @@ CONTRACTS_POSTGRES_PASSWORD=*** CONTRACTS_POSTGRES_DATABASE=*** ATLAN_API_KEY=*** + +FABRIC_ENDPOINT=*** +FABRIC_DWH=*** diff --git a/dev-requirements.txt b/dev-requirements.txt index 101208ffe..4a965bb7a 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -175,4 +175,4 @@ zipp==3.19.2 # The following packages are considered to be unsafe in a requirements file: # pip -# setuptools +# setuptools \ No newline at end of file diff --git a/pytest.ini b/pytest.ini index d66b066b1..d5788bdbb 100644 --- a/pytest.ini +++ b/pytest.ini @@ -24,3 +24,4 @@ pythonpath = soda/teradata/tests soda/contracts/tests soda/oracle/tests + soda/fabric/tests diff --git a/requirements.txt b/requirements.txt index 2a1cecba1..3cfd16eba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ ./soda/teradata ./soda/contracts ./soda/atlan +./soda/fabric diff --git a/soda/core/tests/data_source/test_bug_double_metric_computation.py b/soda/core/tests/data_source/test_bug_double_metric_computation.py index 720f6862e..6fe03127e 100644 --- a/soda/core/tests/data_source/test_bug_double_metric_computation.py +++ b/soda/core/tests/data_source/test_bug_double_metric_computation.py @@ -5,7 +5,7 @@ @pytest.mark.skipif( - test_data_source == "sqlserver", + test_data_source in ["fabric", "sqlserver"], reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.", ) def test_double_metric_computation(data_source_fixture: DataSourceFixture): diff --git a/soda/core/tests/data_source/test_data_source_specific_aggregation_functions.py b/soda/core/tests/data_source/test_data_source_specific_aggregation_functions.py index c1f1a4258..8645050f0 100644 --- a/soda/core/tests/data_source/test_data_source_specific_aggregation_functions.py +++ b/soda/core/tests/data_source/test_data_source_specific_aggregation_functions.py @@ -31,7 +31,7 @@ def test_data_source_specific_statistics_aggregation_metrics(data_source_fixture supported_checks.pop("stddev_samp") # TODO see what's going wrong with Vertica later: # Message: Function APPROXIMATE_PERCENTILE(int) does not exist - if test_data_source in ["sqlserver", "mysql", "spark_df", "oracle", "vertica"]: + if test_data_source in ["sqlserver", "mysql", "spark_df", "oracle", "vertica", "fabric"]: supported_checks = {} if supported_checks: diff --git a/soda/core/tests/data_source/test_distribution_check.py b/soda/core/tests/data_source/test_distribution_check.py index 1255bbbb9..35a16255a 100644 --- a/soda/core/tests/data_source/test_distribution_check.py +++ b/soda/core/tests/data_source/test_distribution_check.py @@ -132,7 +132,7 @@ def test_distribution_sql(data_source_fixture: DataSourceFixture, mock_file_syst table_name=table_name, schema_name=f"{data_source_fixture.data_source.database}.{data_source_fixture.schema_name}.", ) - elif test_data_source == "sqlserver": + elif test_data_source in ["fabric", "sqlserver"]: expectation = "SELECT TOP 1000000 \n cst_size \nFROM {schema_name}{table_name}" assert scan._checks[0].query.sql == expectation.format( table_name=table_name, schema_name=f"{data_source_fixture.schema_name}." @@ -498,7 +498,7 @@ def test_continuous_distribution_check_large_sample_size(data_source_fixture: Da data_source_name = data_source_fixture.data_source_name if data_source_name in ["spark_df", "dask"]: assert sorted(distro_check.query.rows) == sorted([[1.0], [1.0], [2.0], [2.0], [3.0]]) - elif data_source_name in ["snowflake", "bigquery", "sqlserver"]: + elif data_source_name in ["snowflake", "bigquery", "sqlserver", "fabric"]: assert len(distro_check.query.rows) == 5 else: assert distro_check.query.rows == sorted([(1.0,), (1.0,), (2.0,), (2.0,), (3.0,)]) diff --git a/soda/core/tests/data_source/test_formats.py b/soda/core/tests/data_source/test_formats.py index 54fed586f..e15947d6e 100644 --- a/soda/core/tests/data_source/test_formats.py +++ b/soda/core/tests/data_source/test_formats.py @@ -7,6 +7,10 @@ def test_formats(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) test_definitions = { + "email": { + "passing_values": ["info@soda.io", "some+email@gmail.com", "a@b.be"], + "failing_values": ["", "a", " ", "1.5", "4,2", "@@@@@"], + }, "integer": { "passing_values": ["0", "1234567890", "-0", "- 1234567890", "+0", "+1"], "failing_values": ["", "a", " ", "1.5", "4,2"], @@ -133,7 +137,7 @@ def test_formats(data_source_fixture: DataSourceFixture): }, } - if test_data_source == "sqlserver": + if test_data_source in ["fabric", "sqlserver"]: test_definitions.pop("percentage") # Partially supported. test_definitions.pop("date us") # Partially supported. test_definitions.pop("date eu") # Partially supported. @@ -159,7 +163,7 @@ def assert_format_values(format, data_source_fixture: DataSourceFixture, table_n def set_up_expression(value: str, format: str) -> str: expression = data_source.get_default_format_expression(f"'{value}'", format) # Special handling for sqlserver and teradata - expression matching cannot be used in the SELECT statement, so wrap it in CASE ... THEN ... ELSE for this test. - if test_data_source in ["sqlserver", "teradata"]: + if test_data_source in ["sqlserver", "teradata", "fabric"]: expression = f"CASE WHEN {expression} THEN 1 ELSE 0 END" return expression diff --git a/soda/core/tests/data_source/test_freshness.py b/soda/core/tests/data_source/test_freshness.py index 3858e0396..30c863f22 100644 --- a/soda/core/tests/data_source/test_freshness.py +++ b/soda/core/tests/data_source/test_freshness.py @@ -116,7 +116,7 @@ def test_freshness_with_table_filter(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) where_cond = ( f"""CONVERT(DATETIME,'${{START_TIME}}') <= ts AND ts < CONVERT(DATETIME,'${{END_TIME}}')""" - if test_data_source == "sqlserver" + if test_data_source in ["fabric", "sqlserver"] else f"""TIMESTAMP '${{START_TIME}}' <= ts AND ts < TIMESTAMP '${{END_TIME}}'""" ) @@ -146,7 +146,7 @@ def test_freshness_with_table_filter(data_source_fixture: DataSourceFixture): def test_freshness_no_rows(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) # There is no boolean type and variables in Teradata - cond = "1 = 0" if test_data_source in ["sqlserver", "teradata"] else "FALSE" + cond = "1 = 0" if test_data_source in ["sqlserver", "teradata", "fabric"] else "FALSE" scan = data_source_fixture.create_test_scan() scan.add_variables( { @@ -174,7 +174,7 @@ def test_freshness_with_check_filter(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) where_cond = ( f"""CONVERT(DATETIME,'${{START_TIME}}') <= ts AND ts < CONVERT(DATETIME,'${{END_TIME}}')""" - if test_data_source == "sqlserver" + if test_data_source in ["fabric", "sqlserver"] else f"""TIMESTAMP '${{START_TIME}}' <= ts AND ts < TIMESTAMP '${{END_TIME}}'""" ) @@ -206,7 +206,7 @@ def test_freshness_with_check_filter(data_source_fixture: DataSourceFixture): def test_freshness_check_filter_no_rows(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) # There is no boolean type and variables in Teradata - cond = "1 = 0" if test_data_source in ["sqlserver", "teradata"] else "FALSE" + cond = "1 = 0" if test_data_source in ["sqlserver", "teradata", "fabric"] else "FALSE" scan = data_source_fixture.create_test_scan() scan.add_variables( { diff --git a/soda/core/tests/data_source/test_invalid.py b/soda/core/tests/data_source/test_invalid.py index 9029ccefe..b27957eb4 100644 --- a/soda/core/tests/data_source/test_invalid.py +++ b/soda/core/tests/data_source/test_invalid.py @@ -86,7 +86,7 @@ def test_valid_min_max(data_source_fixture: DataSourceFixture): @pytest.mark.skipif( - test_data_source == "sqlserver", + test_data_source in ["fabric", "sqlserver"], reason="Full regex support is not supported by SQLServer", ) def test_valid_format_email(data_source_fixture: DataSourceFixture): @@ -107,7 +107,7 @@ def test_valid_format_email(data_source_fixture: DataSourceFixture): @pytest.mark.skipif( - test_data_source == "sqlserver", + test_data_source in ["fabric", "sqlserver"], reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.", ) def test_column_configured_invalid_and_missing_values(data_source_fixture: DataSourceFixture): diff --git a/soda/core/tests/data_source/test_metric_check_filter.py b/soda/core/tests/data_source/test_metric_check_filter.py index a3a9400c1..a501ce187 100644 --- a/soda/core/tests/data_source/test_metric_check_filter.py +++ b/soda/core/tests/data_source/test_metric_check_filter.py @@ -65,7 +65,7 @@ def test_missing_filtered_sample_query(data_source_fixture: DataSourceFixture): @pytest.mark.skipif( - test_data_source == "sqlserver", + test_data_source in ["fabric", "sqlserver"], reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.", ) def test_valid_filtered(data_source_fixture: DataSourceFixture): @@ -88,7 +88,7 @@ def test_valid_filtered(data_source_fixture: DataSourceFixture): @pytest.mark.skipif( - test_data_source == "sqlserver", + test_data_source in ["fabric", "sqlserver"], reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.", ) def test_valid_percentage_filtered(data_source_fixture: DataSourceFixture): diff --git a/soda/core/tests/data_source/test_numerical_metric_checks_on_text_columns.py b/soda/core/tests/data_source/test_numerical_metric_checks_on_text_columns.py index a5e7d9138..7f89f87f7 100644 --- a/soda/core/tests/data_source/test_numerical_metric_checks_on_text_columns.py +++ b/soda/core/tests/data_source/test_numerical_metric_checks_on_text_columns.py @@ -5,7 +5,7 @@ @pytest.mark.skipif( - test_data_source in ["sqlserver"], + test_data_source in ["sqlserver", "fabric"], reason="Full regex support is not supported by SQLServer. REGEXP_REPLACE is used in this check but it is not supported.", ) def test_numeric_metric_checks_on_text_column(data_source_fixture: DataSourceFixture): @@ -32,7 +32,7 @@ def test_numeric_metric_checks_on_text_column(data_source_fixture: DataSourceFix @pytest.mark.skipif( - test_data_source in ["sqlserver"], + test_data_source in ["sqlserver", "fabric"], reason="Full regex support is not supported by SQLServer. REGEXP_REPLACE is used in this check but it is not supported.", ) def test_numeric_metric_checks_on_text_column_local_format(data_source_fixture: DataSourceFixture): diff --git a/soda/core/tests/data_source/test_percentage_metrics.py b/soda/core/tests/data_source/test_percentage_metrics.py index 064a71d48..f22f40214 100644 --- a/soda/core/tests/data_source/test_percentage_metrics.py +++ b/soda/core/tests/data_source/test_percentage_metrics.py @@ -5,7 +5,7 @@ @pytest.mark.skipif( - test_data_source == "sqlserver", + test_data_source in ["fabric", "sqlserver"], reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.", ) def test_default_missing_percentage(data_source_fixture: DataSourceFixture): diff --git a/soda/core/tests/data_source/test_table_filter.py b/soda/core/tests/data_source/test_table_filter.py index 64fd5e146..44bc2b85a 100644 --- a/soda/core/tests/data_source/test_table_filter.py +++ b/soda/core/tests/data_source/test_table_filter.py @@ -14,7 +14,7 @@ def test_filter_on_date(data_source_fixture: DataSourceFixture): scan.add_variables( {"DATE_LOWER": "2020-06-23", "DATE_UPPER": "2020-06-24"} ) # use DATE_LOWER and DATE_UPPER to avoid issues with dask - date_expr = "" if test_data_source == "sqlserver" else "DATE" + date_expr = "" if test_data_source in ["fabric", "sqlserver"] else "DATE" scan.add_sodacl_yaml_str( f""" filter {table_name} [daily]: @@ -69,7 +69,7 @@ def test_table_filter_on_timestamp(data_source_fixture: DataSourceFixture): table_name = data_source_fixture.ensure_test_table(customers_test_table) scan = data_source_fixture.create_test_scan() - if test_data_source == "sqlserver": + if test_data_source in ["fabric", "sqlserver"]: where_cond = f"""CONVERT(DATETIME, '${{ts_start}}') <= ts AND ts < CONVERT(DATETIME,'${{ts_end}}')""" elif test_data_source == "dask": where_cond = f"""\"'${{ts_start}}' <= ts AND ts < '${{ts_end}}'\"""" diff --git a/soda/core/tests/data_source/test_user_defined_metric_checks.py b/soda/core/tests/data_source/test_user_defined_metric_checks.py index 6baad84ab..781ac91ef 100644 --- a/soda/core/tests/data_source/test_user_defined_metric_checks.py +++ b/soda/core/tests/data_source/test_user_defined_metric_checks.py @@ -10,7 +10,7 @@ def test_user_defined_table_expression_metric_check(data_source_fixture: DataSou table_name = data_source_fixture.ensure_test_table(customers_test_table) scan = data_source_fixture.create_test_scan() - length_expr = "LEN" if data_source_fixture.data_source_name == "sqlserver" else "LENGTH" + length_expr = "LEN" if data_source_fixture.data_source_name in ["sqlserver", "fabric"] else "LENGTH" ones_expression = f"SUM({length_expr}(cst_size_txt))" diff --git a/soda/fabric/LICENSE b/soda/fabric/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/soda/fabric/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/soda/fabric/setup.py b/soda/fabric/setup.py new file mode 100644 index 000000000..7713bef4b --- /dev/null +++ b/soda/fabric/setup.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python + +from setuptools import find_namespace_packages, setup + +package_name = "soda-core-fabric" +package_version = "3.3.22" +description = "Soda Core Microsoft Fabric Package" + +requires = [f"soda-core-sqlserver=={package_version}"] +# TODO Fix the params +setup( + name=package_name, + version=package_version, + install_requires=requires, + packages=find_namespace_packages(include=["soda*"]), +) diff --git a/soda/fabric/soda/data_sources/fabric_data_source.py b/soda/fabric/soda/data_sources/fabric_data_source.py new file mode 100644 index 000000000..f0f48a50f --- /dev/null +++ b/soda/fabric/soda/data_sources/fabric_data_source.py @@ -0,0 +1,43 @@ +from soda.data_sources.sqlserver_data_source import SQLServerDataSource +from soda.execution.data_type import DataType + + +class FabricDataSource(SQLServerDataSource): + TYPE = "fabric" + + SCHEMA_CHECK_TYPES_MAPPING: dict = {"TEXT": ["varchar", "char"]} + + SQL_TYPE_FOR_CREATE_TABLE_MAP: dict = { + DataType.TEXT: "varchar(255)", + DataType.INTEGER: "int", + DataType.DECIMAL: "float", + DataType.DATE: "date", + DataType.TIME: "time", + DataType.TIMESTAMP: "datetime2(6)", + DataType.TIMESTAMP_TZ: "datetime2(6)", + DataType.BOOLEAN: "bit", + } + + SQL_TYPE_FOR_SCHEMA_CHECK_MAP: dict = { + DataType.TEXT: "varchar", + DataType.INTEGER: "int", + DataType.DECIMAL: "float", + DataType.DATE: "date", + DataType.TIME: "time(6)", + DataType.TIMESTAMP: "datetime2", + DataType.TIMESTAMP_TZ: "datetime2", + DataType.BOOLEAN: "bit", + } + + NUMERIC_TYPES_FOR_PROFILING = [ + "bigint", + "numeric", + "bit", + "smallint", + "decimal", + "int", + "float", + "real", + ] + + TEXT_TYPES_FOR_PROFILING = ["char", "varchar"] diff --git a/soda/fabric/tests/fabric_data_source_fixture.py b/soda/fabric/tests/fabric_data_source_fixture.py new file mode 100644 index 000000000..b691e3460 --- /dev/null +++ b/soda/fabric/tests/fabric_data_source_fixture.py @@ -0,0 +1,20 @@ +import os + +from sqlserver_data_source_fixture import SQLServerDataSourceFixture + + +class FabricDataSourceFixture(SQLServerDataSourceFixture): + def _build_configuration_dict(self, schema_name: str | None = None) -> dict: + return { + "data_source fabric": { + "type": "fabric", + "host": os.getenv("FABRIC_ENDPOINT"), + "database": os.getenv("FABRIC_DWH"), + "schema": schema_name or os.getenv("FABRIC_SCHEMA", "dbo"), + "driver": os.getenv("FABRIC_DRIVER", "ODBC Driver 18 for SQL Server"), + "client_id": os.getenv("FABRIC_CLIENT_ID"), + "client_secret": os.getenv("FABRIC_CLIENT_SECRET"), + "encrypt": True, + "authentication": os.getenv("FABRIC_AUTHENTICATION", "CLI"), + } + } diff --git a/soda/fabric/tests/test_fabric.py b/soda/fabric/tests/test_fabric.py new file mode 100644 index 000000000..287fa8dee --- /dev/null +++ b/soda/fabric/tests/test_fabric.py @@ -0,0 +1,2 @@ +def test_fabric(): + """Add plugin specific tests here. Present so that CI is simpler and to avoid false plugin-specific tests passing.""" diff --git a/soda/sqlserver/setup.py b/soda/sqlserver/setup.py index 854d6881f..fa52f9111 100644 --- a/soda/sqlserver/setup.py +++ b/soda/sqlserver/setup.py @@ -6,10 +6,7 @@ package_version = "3.3.22" description = "Soda Core SQL Server Package" -requires = [ - f"soda-core=={package_version}", - "pyodbc", -] +requires = [f"soda-core=={package_version}", "pyodbc", "azure-identity~=1.17.1"] # TODO Fix the params setup( name=package_name, diff --git a/soda/sqlserver/soda/data_sources/sqlserver_data_source.py b/soda/sqlserver/soda/data_sources/sqlserver_data_source.py index 62cf17391..f13e68ae5 100644 --- a/soda/sqlserver/soda/data_sources/sqlserver_data_source.py +++ b/soda/sqlserver/soda/data_sources/sqlserver_data_source.py @@ -3,10 +3,20 @@ import logging import re import struct +import time from datetime import datetime, timedelta, timezone +from itertools import chain, repeat from textwrap import dedent +from typing import Callable, Mapping import pyodbc +from azure.core.credentials import AccessToken +from azure.identity import ( + AzureCliCredential, + DefaultAzureCredential, + EnvironmentCredential, +) +from soda.__version__ import SODA_CORE_VERSION from soda.common.exceptions import DataSourceConnectionError from soda.common.logs import Logs from soda.execution.data_source import DataSource @@ -15,10 +25,88 @@ logger = logging.getLogger(__name__) +_AZURE_AUTH_FUNCTION_TYPE = Callable[..., AccessToken] +_SQL_COPT_SS_ACCESS_TOKEN = 1256 +_MAX_REMAINING_AZURE_ACCESS_TOKEN_LIFETIME = 300 +_AZURE_CREDENTIAL_SCOPE = "https://database.windows.net//.default" +_SYNAPSE_SPARK_CREDENTIAL_SCOPE = "DW" +_FABRIC_SPARK_CREDENTIAL_SCOPE = "https://analysis.windows.net/powerbi/api" + + +def _get_auto_access_token() -> AccessToken: + return DefaultAzureCredential().get_token(_AZURE_CREDENTIAL_SCOPE) + + +def _get_environment_access_token() -> AccessToken: + return EnvironmentCredential().get_token(_AZURE_CREDENTIAL_SCOPE) + + +def _get_azure_cli_access_token() -> AccessToken: + return AzureCliCredential().get_token(_AZURE_CREDENTIAL_SCOPE) + + +def _get_mssparkutils_access_token(scope: str) -> AccessToken: + from notebookutils import mssparkutils + + aad_token = mssparkutils.credentials.getToken(scope) + expires_on = int(time.time() + 4500.0) + token = AccessToken( + token=aad_token, + expires_on=expires_on, + ) + return token + + +def _get_synapse_spark_access_token() -> AccessToken: + return _get_mssparkutils_access_token(_SYNAPSE_SPARK_CREDENTIAL_SCOPE) + + +def _get_fabric_spark_access_token() -> AccessToken: + return _get_mssparkutils_access_token(_FABRIC_SPARK_CREDENTIAL_SCOPE) + + +_AZURE_AUTH_FUNCTIONS: Mapping[str, _AZURE_AUTH_FUNCTION_TYPE] = { + "auto": _get_auto_access_token, + "cli": _get_azure_cli_access_token, + "environment": _get_environment_access_token, + "synapsespark": _get_synapse_spark_access_token, + "fabricspark": _get_fabric_spark_access_token, +} + + +def convert_bytes_to_mswindows_byte_string(value): + encoded_bytes = bytes(chain.from_iterable(zip(value, repeat(0)))) + return struct.pack(" 0: + conn_params.append(f"ConnectRetryCount={int(self.connection_max_retries)}") + + if self.enable_tracing: + conn_params.append("SQL_ATTR_TRACE=SQL_OPT_TRACE_ON") + + if self.authentication.lower() == "sql": + conn_params.append(f"UID={{{self.username}}}") + conn_params.append(f"PWD={{{self.password}}}") + elif self.authentication.lower() == "activedirectoryinteractive": + conn_params.append("Authentication=ActiveDirectoryInteractive") + conn_params.append(f"UID={{{self.username}}}") + elif self.authentication.lower() == "activedirectorypassword": + conn_params.append("Authentication=ActiveDirectoryPassword") + conn_params.append(f"UID={{{self.username}}}") + conn_params.append(f"PWD={{{self.password}}}") + elif self.authentication.lower() == "activedirectoryserviceprincipal": + conn_params.append("Authentication=ActiveDirectoryServicePrincipal") + conn_params.append(f"UID={{{self.client_id}}}") + conn_params.append(f"PWD={{{self.client_secret}}}") + elif "activedirectory" in self.authentication.lower(): + conn_params.append(f"Authentication={self.authentication}") + + conn_params.append(f"APP=soda-core-fabric/{SODA_CORE_VERSION}") + + conn_str = ";".join(conn_params) + + return conn_str + + try: self.connection = pyodbc.connect( - ("Trusted_Connection=YES;" if self.trusted_connection else "") - + ("TrustServerCertificate=YES;" if self.trust_server_certificate else "") - + ("Encrypt=YES;" if self.encrypt else "") - + (f"{connection_parameters_string};" if connection_parameters_string else "") - + "DRIVER={" - + self.driver - + "};SERVER=" - + self.host - + "," - + str(self.port) - + ";DATABASE=" - + self.database - + ";UID=" - + self.username - + ";PWD=" - + self.password + build_connection_string(), + attrs_before=get_pyodbc_attrs(self.authentication), + timeout=int(self.login_timeout), ) self.connection.add_output_converter(-155, handle_datetimeoffset) diff --git a/tbump.toml b/tbump.toml index bfbd49295..65ff7045f 100644 --- a/tbump.toml +++ b/tbump.toml @@ -105,3 +105,7 @@ search = 'package_version = "{current_version}"' [[file]] src = "soda/atlan/setup.py" search = 'package_version = "{current_version}"' + +[[file]] +src = "soda/fabric/setup.py" +search = 'package_version = "{current_version}"'