Add support for Azure SQL, Synapse, and Microsoft Fabric and extend support for SQL Server (#2160)

* working fabric data source inheriting from sqlserver (a sketch of this shape follows the change summary below)

* fix failing tests

* fix table creation in fabric

* restore dev-reqs

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add email check for sqlserver and fabric

* add test for email format

* remove useless line

* remove useless line

* remove extra deps

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* enable auth with mssparkutils

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add fabric spark auth

* Update tbump+version

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Milan Lukac <m1n0@users.noreply.github.com>
Co-authored-by: Milan Lukac <milan@lukac.online>
4 people authored Oct 21, 2024
1 parent 52dc476 commit a08bbcc
Showing 23 changed files with 480 additions and 50 deletions.
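
A minimal sketch of the shape the commit bullets describe: a Fabric data source that inherits from the SQL Server one and, when running inside a Fabric/Synapse Spark session, authenticates with a token obtained from mssparkutils. The class names, config keys, token audience, and fallback are assumptions for illustration only; they are not copied from this PR.

import struct

import pyodbc


class SQLServerDataSource:  # stand-in for the real sqlserver data source class
    def __init__(self, config: dict):
        self.host = config["host"]
        self.database = config["database"]
        self.driver = config.get("driver", "ODBC Driver 18 for SQL Server")


class FabricDataSource(SQLServerDataSource):
    TYPE = "fabric"

    def connect(self) -> "pyodbc.Connection":
        connection_string = f"DRIVER={{{self.driver}}};SERVER={self.host};DATABASE={self.database};"
        try:
            # Inside Fabric/Synapse Spark, mssparkutils can mint an Entra ID token
            # for the warehouse endpoint (the audience string here is an assumption).
            from notebookutils import mssparkutils

            token = mssparkutils.credentials.getToken("https://database.windows.net/")
            # pyodbc takes the token as a length-prefixed UTF-16-LE blob via the
            # SQL_COPT_SS_ACCESS_TOKEN (1256) pre-connect attribute.
            token_bytes = token.encode("utf-16-le")
            token_struct = struct.pack(f"<I{len(token_bytes)}s", len(token_bytes), token_bytes)
            return pyodbc.connect(connection_string, attrs_before={1256: token_struct})
        except ImportError:
            # Outside Spark, fall back to driver-level Entra ID authentication.
            return pyodbc.connect(connection_string + "Authentication=ActiveDirectoryInteractive;")

The data source registered by this commit lives under soda/fabric (see the requirements.txt and pytest.ini changes below); the sketch only illustrates the inheritance and the mssparkutils hand-off.
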
3 changes: 3 additions & 0 deletions .env.example
@@ -51,3 +51,6 @@ CONTRACTS_POSTGRES_PASSWORD=***
CONTRACTS_POSTGRES_DATABASE=***

ATLAN_API_KEY=***

FABRIC_ENDPOINT=***
FABRIC_DWH=***
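
The two new variables are what the Fabric test fixture reads to reach a warehouse. A hedged usage sketch of pointing a scan at such a data source follows; only FABRIC_ENDPOINT and FABRIC_DWH come from this diff, while the configuration keys (host, database, schema), the table name, and the check are placeholders.

import os

from soda.scan import Scan

# Hedged sketch: configuration keys are assumptions; only the two environment
# variables added above come from this commit.
scan = Scan()
scan.set_data_source_name("fabric")
scan.add_configuration_yaml_str(
    f"""
data_source fabric:
  type: fabric
  host: {os.environ["FABRIC_ENDPOINT"]}
  database: {os.environ["FABRIC_DWH"]}
  schema: dbo
"""
)
scan.add_sodacl_yaml_str(
    """
checks for dim_customer:
  - row_count > 0
"""
)
scan.execute()
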
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -175,4 +175,4 @@ zipp==3.19.2

# The following packages are considered to be unsafe in a requirements file:
# pip
# setuptools
# setuptools
1 change: 1 addition & 0 deletions pytest.ini
@@ -24,3 +24,4 @@ pythonpath =
soda/teradata/tests
soda/contracts/tests
soda/oracle/tests
soda/fabric/tests
1 change: 1 addition & 0 deletions requirements.txt
@@ -22,3 +22,4 @@
./soda/teradata
./soda/contracts
./soda/atlan
./soda/fabric
@@ -5,7 +5,7 @@


@pytest.mark.skipif(
test_data_source == "sqlserver",
test_data_source in ["fabric", "sqlserver"],
reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.",
)
def test_double_metric_computation(data_source_fixture: DataSourceFixture):
@@ -31,7 +31,7 @@ def test_data_source_specific_statistics_aggregation_metrics(data_source_fixture
supported_checks.pop("stddev_samp")
# TODO see what's going wrong with Vertica later:
# Message: Function APPROXIMATE_PERCENTILE(int) does not exist
if test_data_source in ["sqlserver", "mysql", "spark_df", "oracle", "vertica"]:
if test_data_source in ["sqlserver", "mysql", "spark_df", "oracle", "vertica", "fabric"]:
supported_checks = {}

if supported_checks:
4 changes: 2 additions & 2 deletions soda/core/tests/data_source/test_distribution_check.py
@@ -132,7 +132,7 @@ def test_distribution_sql(data_source_fixture: DataSourceFixture, mock_file_syst
table_name=table_name,
schema_name=f"{data_source_fixture.data_source.database}.{data_source_fixture.schema_name}.",
)
elif test_data_source == "sqlserver":
elif test_data_source in ["fabric", "sqlserver"]:
expectation = "SELECT TOP 1000000 \n cst_size \nFROM {schema_name}{table_name}"
assert scan._checks[0].query.sql == expectation.format(
table_name=table_name, schema_name=f"{data_source_fixture.schema_name}."
@@ -498,7 +498,7 @@ def test_continuous_distribution_check_large_sample_size(data_source_fixture: Da
data_source_name = data_source_fixture.data_source_name
if data_source_name in ["spark_df", "dask"]:
assert sorted(distro_check.query.rows) == sorted([[1.0], [1.0], [2.0], [2.0], [3.0]])
elif data_source_name in ["snowflake", "bigquery", "sqlserver"]:
elif data_source_name in ["snowflake", "bigquery", "sqlserver", "fabric"]:
assert len(distro_check.query.rows) == 5
else:
assert distro_check.query.rows == sorted([(1.0,), (1.0,), (2.0,), (2.0,), (3.0,)])
8 changes: 6 additions & 2 deletions soda/core/tests/data_source/test_formats.py
@@ -7,6 +7,10 @@ def test_formats(data_source_fixture: DataSourceFixture):
table_name = data_source_fixture.ensure_test_table(customers_test_table)

test_definitions = {
"email": {
"passing_values": ["info@soda.io", "some+email@gmail.com", "a@b.be"],
"failing_values": ["", "a", " ", "1.5", "4,2", "@@@@@"],
},
"integer": {
"passing_values": ["0", "1234567890", "-0", "- 1234567890", "+0", "+1"],
"failing_values": ["", "a", " ", "1.5", "4,2"],
@@ -133,7 +137,7 @@ def test_formats(data_source_fixture: DataSourceFixture):
},
}

if test_data_source == "sqlserver":
if test_data_source in ["fabric", "sqlserver"]:
test_definitions.pop("percentage") # Partially supported.
test_definitions.pop("date us") # Partially supported.
test_definitions.pop("date eu") # Partially supported.
@@ -159,7 +163,7 @@ def assert_format_values(format, data_source_fixture: DataSourceFixture, table_n
def set_up_expression(value: str, format: str) -> str:
expression = data_source.get_default_format_expression(f"'{value}'", format)
# Special handling for sqlserver and teradata - expression matching cannot be used in the SELECT statement, so wrap it in CASE ... THEN ... ELSE for this test.
if test_data_source in ["sqlserver", "teradata"]:
if test_data_source in ["sqlserver", "teradata", "fabric"]:
expression = f"CASE WHEN {expression} THEN 1 ELSE 0 END"

return expression
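
The new "email" entry above, together with the "add email check for sqlserver and fabric" commit, suggests the SQL Server family gets an email format that works without full regex support. A hedged sketch of how such an expression can be built; the LIKE pattern and helper name are illustrative, and the real logic lives in the data sources' get_default_format_expression.

def email_format_expression(expr: str) -> str:
    # Hedged approximation: at least one character, an '@', a domain containing
    # a dot followed by at least two characters, and no spaces. It accepts the
    # test's passing values (info@soda.io, some+email@gmail.com, a@b.be) and
    # rejects the failing ones ("", "a", " ", "1.5", "4,2", "@@@@@").
    return f"({expr} LIKE '%_@_%.__%' AND {expr} NOT LIKE '% %')"


# SQL Server, Fabric, and Teradata cannot put a bare boolean expression in a
# SELECT list, hence the CASE ... THEN ... ELSE wrapping in set_up_expression:
expression = email_format_expression("'info@soda.io'")
select_item = f"CASE WHEN {expression} THEN 1 ELSE 0 END"
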
8 changes: 4 additions & 4 deletions soda/core/tests/data_source/test_freshness.py
@@ -116,7 +116,7 @@ def test_freshness_with_table_filter(data_source_fixture: DataSourceFixture):
table_name = data_source_fixture.ensure_test_table(customers_test_table)
where_cond = (
f"""CONVERT(DATETIME,'${{START_TIME}}') <= ts AND ts < CONVERT(DATETIME,'${{END_TIME}}')"""
if test_data_source == "sqlserver"
if test_data_source in ["fabric", "sqlserver"]
else f"""TIMESTAMP '${{START_TIME}}' <= ts AND ts < TIMESTAMP '${{END_TIME}}'"""
)

@@ -146,7 +146,7 @@ def test_freshness_with_table_filter(data_source_fixture: DataSourceFixture):
def test_freshness_no_rows(data_source_fixture: DataSourceFixture):
table_name = data_source_fixture.ensure_test_table(customers_test_table)
# There is no boolean type and variables in Teradata
cond = "1 = 0" if test_data_source in ["sqlserver", "teradata"] else "FALSE"
cond = "1 = 0" if test_data_source in ["sqlserver", "teradata", "fabric"] else "FALSE"
scan = data_source_fixture.create_test_scan()
scan.add_variables(
{
@@ -174,7 +174,7 @@ def test_freshness_with_check_filter(data_source_fixture: DataSourceFixture):
table_name = data_source_fixture.ensure_test_table(customers_test_table)
where_cond = (
f"""CONVERT(DATETIME,'${{START_TIME}}') <= ts AND ts < CONVERT(DATETIME,'${{END_TIME}}')"""
if test_data_source == "sqlserver"
if test_data_source in ["fabric", "sqlserver"]
else f"""TIMESTAMP '${{START_TIME}}' <= ts AND ts < TIMESTAMP '${{END_TIME}}'"""
)

@@ -206,7 +206,7 @@ def test_freshness_check_filter_no_rows(data_source_fixture: DataSourceFixture):
def test_freshness_check_filter_no_rows(data_source_fixture: DataSourceFixture):
table_name = data_source_fixture.ensure_test_table(customers_test_table)
# There is no boolean type and variables in Teradata
cond = "1 = 0" if test_data_source in ["sqlserver", "teradata"] else "FALSE"
cond = "1 = 0" if test_data_source in ["sqlserver", "teradata", "fabric"] else "FALSE"
scan = data_source_fixture.create_test_scan()
scan.add_variables(
{
4 changes: 2 additions & 2 deletions soda/core/tests/data_source/test_invalid.py
@@ -86,7 +86,7 @@ def test_valid_min_max(data_source_fixture: DataSourceFixture):


@pytest.mark.skipif(
test_data_source == "sqlserver",
test_data_source in ["fabric", "sqlserver"],
reason="Full regex support is not supported by SQLServer",
)
def test_valid_format_email(data_source_fixture: DataSourceFixture):
@@ -107,7 +107,7 @@ def test_valid_format_email(data_source_fixture: DataSourceFixture):


@pytest.mark.skipif(
test_data_source == "sqlserver",
test_data_source in ["fabric", "sqlserver"],
reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.",
)
def test_column_configured_invalid_and_missing_values(data_source_fixture: DataSourceFixture):
4 changes: 2 additions & 2 deletions soda/core/tests/data_source/test_metric_check_filter.py
@@ -65,7 +65,7 @@ def test_missing_filtered_sample_query(data_source_fixture: DataSourceFixture):


@pytest.mark.skipif(
test_data_source == "sqlserver",
test_data_source in ["fabric", "sqlserver"],
reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.",
)
def test_valid_filtered(data_source_fixture: DataSourceFixture):
@@ -88,7 +88,7 @@ def test_valid_filtered(data_source_fixture: DataSourceFixture):


@pytest.mark.skipif(
test_data_source == "sqlserver",
test_data_source in ["fabric", "sqlserver"],
reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.",
)
def test_valid_percentage_filtered(data_source_fixture: DataSourceFixture):
@@ -5,7 +5,7 @@


@pytest.mark.skipif(
test_data_source in ["sqlserver"],
test_data_source in ["sqlserver", "fabric"],
reason="Full regex support is not supported by SQLServer. REGEXP_REPLACE is used in this check but it is not supported.",
)
def test_numeric_metric_checks_on_text_column(data_source_fixture: DataSourceFixture):
@@ -32,7 +32,7 @@ def test_numeric_metric_checks_on_text_column(data_source_fix


@pytest.mark.skipif(
test_data_source in ["sqlserver"],
test_data_source in ["sqlserver", "fabric"],
reason="Full regex support is not supported by SQLServer. REGEXP_REPLACE is used in this check but it is not supported.",
)
def test_numeric_metric_checks_on_text_column_local_format(data_source_fixture: DataSourceFixture):
2 changes: 1 addition & 1 deletion soda/core/tests/data_source/test_percentage_metrics.py
@@ -5,7 +5,7 @@


@pytest.mark.skipif(
test_data_source == "sqlserver",
test_data_source in ["fabric", "sqlserver"],
reason="Full regex support is not supported by SQLServer. 'Percentage' format is supported but with limited functionality.",
)
def test_default_missing_percentage(data_source_fixture: DataSourceFixture):
4 changes: 2 additions & 2 deletions soda/core/tests/data_source/test_table_filter.py
@@ -14,7 +14,7 @@ def test_filter_on_date(data_source_fixture: DataSourceFixture):
scan.add_variables(
{"DATE_LOWER": "2020-06-23", "DATE_UPPER": "2020-06-24"}
) # use DATE_LOWER and DATE_UPPER to avoid issues with dask
date_expr = "" if test_data_source == "sqlserver" else "DATE"
date_expr = "" if test_data_source in ["fabric", "sqlserver"] else "DATE"
scan.add_sodacl_yaml_str(
f"""
filter {table_name} [daily]:
@@ -69,7 +69,7 @@ def test_table_filter_on_timestamp(data_source_fixture: DataSourceFixture):
table_name = data_source_fixture.ensure_test_table(customers_test_table)

scan = data_source_fixture.create_test_scan()
if test_data_source == "sqlserver":
if test_data_source in ["fabric", "sqlserver"]:
where_cond = f"""CONVERT(DATETIME, '${{ts_start}}') <= ts AND ts < CONVERT(DATETIME,'${{ts_end}}')"""
elif test_data_source == "dask":
where_cond = f"""\"'${{ts_start}}' <= ts AND ts < '${{ts_end}}'\""""
@@ -10,7 +10,7 @@ def test_user_defined_table_expression_metric_check(data_source_fixture: DataSou
table_name = data_source_fixture.ensure_test_table(customers_test_table)

scan = data_source_fixture.create_test_scan()
length_expr = "LEN" if data_source_fixture.data_source_name == "sqlserver" else "LENGTH"
length_expr = "LEN" if data_source_fixture.data_source_name in ["sqlserver", "fabric"] else "LENGTH"

ones_expression = f"SUM({length_expr}(cst_size_txt))"
