From 9caa66e4ad543f593a63576727f34ba4e5a25610 Mon Sep 17 00:00:00 2001
From: James Weakley
Date: Fri, 19 Mar 2021 19:22:25 +1100
Subject: [PATCH] Partial SQL Server support

---
 README.md                                     | 26 +++++++++----------
 dbt_project.yml                               |  2 +-
 integration_tests/dbt_project.yml             | 12 ++++-----
 .../equality_with_numeric_tolerance.sql       |  5 ++++
 .../quantile_transformer_model_macro.sql      | 22 +++++-----------
 .../macros/standard_scaler_model_macro.sql    | 16 ++++++++++++
 ...tile_transformer_result_with_tolerance.sql | 10 ++++---
 .../sql/test_k_bins_discretizer_50_bins.sql   |  8 +-----
 .../test_k_bins_discretizer_default_bins.sql  |  9 +------
 .../models/sql/test_label_encoder.sql         |  7 +----
 .../models/sql/test_max_abs_scaler.sql        |  8 +-----
 ...t_max_abs_scaler_with_column_selection.sql |  7 +----
 .../models/sql/test_min_max_scaler.sql        |  8 +-----
 ...t_min_max_scaler_with_column_selection.sql |  8 +-----
 .../models/sql/test_normalizer.sql            |  8 +-----
 .../models/sql/test_one_hot_encoder.sql       |  8 +-----
 ...test_one_hot_encoder_category_selected.sql |  6 +----
 .../models/sql/test_robust_scaler.sql         |  8 +-----
 .../models/sql/test_standard_scaler.sql       | 17 +++---------
 macros/k_bins_discretizer.sql                 | 23 ++++++++++++++++
 macros/label_encoder.sql                      |  8 ++++++
 macros/quantile_transformer.sql               |  5 ++--
 macros/quantile_transformer.yml               |  4 ---
 macros/robust_scaler.sql                      | 26 +++++++++++++++++++
 24 files changed, 129 insertions(+), 132 deletions(-)
 create mode 100644 integration_tests/macros/standard_scaler_model_macro.sql

diff --git a/README.md b/README.md
index 58e5ad3..ee1a2d4 100644
--- a/README.md
+++ b/README.md
@@ -4,21 +4,21 @@
 A package for dbt which enables standardization of data sets. You can use it to build a feature store in your data warehouse, without using external libraries like Spark's MLlib or Python's scikit-learn.
 
 The package contains a set of macros that mirror the functionality of the [scikit-learn preprocessing module](https://scikit-learn.org/stable/modules/preprocessing.html). Originally they were developed as part of the 2019 Medium article [Feature Engineering in Snowflake](https://medium.com/omnata/feature-engineering-in-snowflake-4312032e0d53).
 
-Currently they have been tested in Snowflake, Redshift and BigQuery. The test case expectations have been built using scikit-learn (see *.py in [integration_tests/data/sql](integration_tests/data/sql)), so you can expect behavioural parity with it.
+Currently they have been tested in Snowflake, Redshift, BigQuery, and SQL Server. The test case expectations have been built using scikit-learn (see *.py in [integration_tests/data/sql](integration_tests/data/sql)), so you can expect behavioural parity with it.
 
 The macros are:
-| scikit-learn function | macro name | Snowflake | BigQuery | Redshift | Example |
-| --- | --- | --- | --- | --- | --- |
-| [KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html#sklearn.preprocessing.KBinsDiscretizer)| k_bins_discretizer | Y | Y | Y | ![example](images/k_bins.gif) |
-| [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder)| label_encoder | Y | Y | Y | ![example](images/label_encoder.gif) |
-| [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html#sklearn.preprocessing.MaxAbsScaler) | max_abs_scaler | Y | Y | Y | [![example](images/max_abs_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#maxabsscaler) |
-| [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler) | min_max_scaler | Y | Y | Y | [![example](images/min_max_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#minmaxscaler) |
-| [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer) | normalizer | Y | Y | Y | [![example](images/normalizer.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#normalizer) |
-| [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder) | one_hot_encoder | Y | Y | Y | ![example](images/one_hot_encoder.gif) |
-| [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html#sklearn.preprocessing.QuantileTransformer) | quantile_transformer | Y | Y | N | [![example](images/quantile_transformer.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#quantiletransformer-uniform-output) |
-| [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) | robust_scaler | Y | Y | Y | [![example](images/robust_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#robustscaler) |
-| [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler) | standard_scaler | Y | Y | Y | [![example](images/standard_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#standardscaler) |
+| scikit-learn function | macro name | Snowflake | BigQuery | Redshift | MSSQL | Example |
+| --- | --- | --- | --- | --- | --- | --- |
+| [KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html#sklearn.preprocessing.KBinsDiscretizer)| k_bins_discretizer | Y | Y | Y | N | ![example](images/k_bins.gif) |
+| [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder)| label_encoder | Y | Y | Y | Y | ![example](images/label_encoder.gif) |
+| [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html#sklearn.preprocessing.MaxAbsScaler) | max_abs_scaler | Y | Y | Y | Y | [![example](images/max_abs_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#maxabsscaler) |
+| [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html#sklearn.preprocessing.MinMaxScaler) | min_max_scaler | Y | Y | Y | N | [![example](images/min_max_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#minmaxscaler) |
+| [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer) | normalizer | Y | Y | Y | Y | [![example](images/normalizer.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#normalizer) |
+| [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder) | one_hot_encoder | Y | Y | Y | Y | ![example](images/one_hot_encoder.gif) |
+| [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html#sklearn.preprocessing.QuantileTransformer) | quantile_transformer | Y | Y | N | N | [![example](images/quantile_transformer.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#quantiletransformer-uniform-output) |
+| [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler) | robust_scaler | Y | Y | Y | N | [![example](images/robust_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#robustscaler) |
+| [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler) | standard_scaler | Y | Y | Y | N | [![example](images/standard_scaler.png)](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#standardscaler) |
 
 _\* 2D charts taken from [scikit-learn.org](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html), GIFs are my own_
 
 ## Installation
@@ -26,7 +26,7 @@
 To use this in your dbt project, create or modify packages.yml to include:
 ```
 packages:
   - package: "omnata-labs/dbt_ml_preprocessing"
-    version: [">=1.0.0"]
+    version: [">=1.0.1"]
 ```
 _(replace the revision number with the latest)_

diff --git a/dbt_project.yml b/dbt_project.yml
index 13531e2..0175002 100644
--- a/dbt_project.yml
+++ b/dbt_project.yml
@@ -1,5 +1,5 @@
 name: 'dbt_ml_preprocessing'
-version: '1.0.0'
+version: '1.0.1'
 
 require-dbt-version: ">=0.15.1"
 

diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml
index 954deca..096937a 100644
--- a/integration_tests/dbt_project.yml
+++ b/integration_tests/dbt_project.yml
@@ -71,15 +71,15 @@ seeds:
     data_one_hot_encoder_expected:
       +column_types:
-        is_column_to_encode_A: boolean
-        is_column_to_encode_B: boolean
-        is_column_to_encode_C: boolean
-        is_column_to_encode_D: boolean
+        is_column_to_encode_A: "{{ 'int' if target['type']=='sqlserver' else 'boolean' }}"
+        is_column_to_encode_B: "{{ 'int' if target['type']=='sqlserver' else 'boolean' }}"
+        is_column_to_encode_C: "{{ 'int' if target['type']=='sqlserver' else 'boolean' }}"
+        is_column_to_encode_D: "{{ 'int' if target['type']=='sqlserver' else 'boolean' }}"
 
     data_one_hot_encoder_category_selected_expected:
       +column_types:
-        is_column_to_encode_A: boolean
-        is_column_to_encode_B: boolean
+        is_column_to_encode_A: "{{ 'int' if target['type']=='sqlserver' else 'boolean' }}"
+        is_column_to_encode_B: "{{ 'int' if target['type']=='sqlserver' else 'boolean' }}"
 
     data_quantile_transformer_expected:
       +column_types:

diff --git a/integration_tests/macros/equality_with_numeric_tolerance.sql b/integration_tests/macros/equality_with_numeric_tolerance.sql
index 87a267f..66d9438 100644
--- a/integration_tests/macros/equality_with_numeric_tolerance.sql
+++ b/integration_tests/macros/equality_with_numeric_tolerance.sql
@@ -56,6 +56,11 @@ from joined
 where percent_difference > {{ percentage_tolerance }}
 {% endmacro %}
 
+{% macro sqlserver__test_equality_with_numeric_tolerance(model,compare_model,source_join_column,target_join_column,source_numeric_column_name,target_numeric_column_name,percentage_tolerance,output_all_rows=False) %}
+{% do return( redshift__test_equality_with_numeric_tolerance(model,compare_model,source_join_column,target_join_column,source_numeric_column_name,target_numeric_column_name,percentage_tolerance,output_all_rows=output_all_rows)) %}
+{% endmacro %}
+
+
 {% macro snowflake__test_equality_with_numeric_tolerance(model,compare_model,source_join_column,target_join_column,source_numeric_column_name,target_numeric_column_name,percentage_tolerance,output_all_rows=False) %}
 {% set compare_cols_csv = compare_columns | join(', ') %}
 with a as (

diff --git a/integration_tests/macros/quantile_transformer_model_macro.sql b/integration_tests/macros/quantile_transformer_model_macro.sql
index 19e5220..bb4cf1f 100644
--- a/integration_tests/macros/quantile_transformer_model_macro.sql
+++ b/integration_tests/macros/quantile_transformer_model_macro.sql
@@ -1,24 +1,16 @@
--- macro is only supported in Snowflake
 {% macro snowflake__quantile_transformer_model_macro() %}
 with data as (
-
     {{ dbt_ml_preprocessing.quantile_transformer( ref('data_quantile_transformer') ,'col_to_transform') }}
-
 )
 select * from data
 {% endmacro %}
 
--- other adapters we generate an empty test result to force a test pass
-{% macro bigquery__quantile_transformer_model_macro() %}
-with data as (
-
-    {{ dbt_ml_preprocessing.quantile_transformer( ref('data_quantile_transformer') ,'col_to_transform') }}
-
-)
-select * from data
-{% endmacro %}
-
--- other adapters we generate an empty test result to force a test pass
+-- macro not supported in other databases
 {% macro default__quantile_transformer_model_macro() %}
-select 1 as empty_result from (select 1) where 1=2
+select 1 from (select 1) where 1=2 -- empty result set so that test passes
 {% endmacro %}
+
+-- macro not supported in sqlserver
+{% macro sqlserver__quantile_transformer_model_macro() %}
+select null as '1' where 1=2 -- empty result set so that test passes
+{% endmacro %}
\ No newline at end of file

diff --git a/integration_tests/macros/standard_scaler_model_macro.sql b/integration_tests/macros/standard_scaler_model_macro.sql
new file mode 100644
index 0000000..d4c0f28
--- /dev/null
+++ b/integration_tests/macros/standard_scaler_model_macro.sql
@@ -0,0 +1,16 @@
+{% macro default__standard_scaler_model_macro() %}
+with data as (
+    {{ dbt_ml_preprocessing.standard_scaler( ref('data_standard_scaler') ,['col_to_scale_1','col_to_scale_2']) }}
+)
+select id_col,
+    col_to_scale_1,
+    col_to_scale_2,
+    round(col_to_scale_1_scaled,10) as col_to_scale_1_scaled,
+    round(col_to_scale_2_scaled,10) as col_to_scale_2_scaled
+from data
+{% endmacro %}
+
+-- macro is not supported in mssql
+{% macro sqlserver__standard_scaler_model_macro() %}
+select null as '1' where 1=2 -- empty result set so that test passes
+{% endmacro %}
diff --git a/integration_tests/macros/test_quantile_transformer_result_with_tolerance.sql b/integration_tests/macros/test_quantile_transformer_result_with_tolerance.sql
index fa1cfbe..a7394ec 100644
--- a/integration_tests/macros/test_quantile_transformer_result_with_tolerance.sql
+++ b/integration_tests/macros/test_quantile_transformer_result_with_tolerance.sql
@@ -1,4 +1,3 @@
--- macro is only supported in Snowflake
 {% macro snowflake__test_quantile_transformer_result_with_tolerance() %}
 {{ snowflake__test_equality_with_numeric_tolerance('test_quantile_transformer',
@@ -11,7 +10,12 @@
     output_all_rows=True) }}
 {% endmacro %}
 
--- other adapters we generate an empty test result to force a test pass
+-- testing macro only works on Snowflake
 {% macro default__test_quantile_transformer_result_with_tolerance() %}
 select 1 from (select 1) where 1=2 -- empty result set so that test passes
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
+
+-- testing macro not supported in sqlserver
+{% macro sqlserver__test_quantile_transformer_result_with_tolerance() %}
+select null as '1' where 1=2 -- empty result set so that test passes
+{% endmacro %}

diff --git a/integration_tests/models/sql/test_k_bins_discretizer_50_bins.sql b/integration_tests/models/sql/test_k_bins_discretizer_50_bins.sql
index 982e124..c61a606 100644
--- a/integration_tests/models/sql/test_k_bins_discretizer_50_bins.sql
+++ b/integration_tests/models/sql/test_k_bins_discretizer_50_bins.sql
@@ -1,10 +1,4 @@
 {{ config(materialized='view') }}
 
-with data as (
+{{ dbt_ml_preprocessing.k_bins_discretizer( ref('data_k_bins_discretizer') ,['col_to_bin_1'],n_bins=50) }}
 
-    {{ dbt_ml_preprocessing.k_bins_discretizer( ref('data_k_bins_discretizer') ,['col_to_bin_1'],n_bins=50) }}
-
-)
-
-select * from data
-order by id_col

diff --git a/integration_tests/models/sql/test_k_bins_discretizer_default_bins.sql b/integration_tests/models/sql/test_k_bins_discretizer_default_bins.sql
index 6ca5b4d..8d583e1 100644
--- a/integration_tests/models/sql/test_k_bins_discretizer_default_bins.sql
+++ b/integration_tests/models/sql/test_k_bins_discretizer_default_bins.sql
@@ -1,10 +1,3 @@
 {{ config(materialized='view') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.k_bins_discretizer( ref('data_k_bins_discretizer') ,['col_to_bin_1','col_to_bin_2']) }}
-
-)
-
-select * from data
-order by id_col
\ No newline at end of file
+{{ dbt_ml_preprocessing.k_bins_discretizer( ref('data_k_bins_discretizer') ,['col_to_bin_1','col_to_bin_2']) }}

diff --git a/integration_tests/models/sql/test_label_encoder.sql b/integration_tests/models/sql/test_label_encoder.sql
index 20cef84..c4ea4b3 100644
--- a/integration_tests/models/sql/test_label_encoder.sql
+++ b/integration_tests/models/sql/test_label_encoder.sql
@@ -1,9 +1,4 @@
 {{ config(materialized='table') }} -- as a table because Redshift can't handle the equality checker query when it's a view
 
-with data as (
+{{ dbt_ml_preprocessing.label_encoder( ref('data_label_encoder') ,'col_to_label_encode') }}
 
-    {{ dbt_ml_preprocessing.label_encoder( ref('data_label_encoder') ,'col_to_label_encode') }}
-
-)
-
-select * from data

diff --git a/integration_tests/models/sql/test_max_abs_scaler.sql b/integration_tests/models/sql/test_max_abs_scaler.sql
index 772a497..7fab730 100644
--- a/integration_tests/models/sql/test_max_abs_scaler.sql
+++ b/integration_tests/models/sql/test_max_abs_scaler.sql
@@ -1,9 +1,3 @@
 {{ config(materialized='view') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.max_abs_scaler( ref('data_max_abs_scaler') ,['col_to_scale']) }}
-
-)
-
-select * from data
+{{ dbt_ml_preprocessing.max_abs_scaler( ref('data_max_abs_scaler') ,['col_to_scale']) }}
diff --git a/integration_tests/models/sql/test_max_abs_scaler_with_column_selection.sql b/integration_tests/models/sql/test_max_abs_scaler_with_column_selection.sql
index dda3f9f..66e26eb 100644
--- a/integration_tests/models/sql/test_max_abs_scaler_with_column_selection.sql
+++ b/integration_tests/models/sql/test_max_abs_scaler_with_column_selection.sql
@@ -1,9 +1,4 @@
 {{ config(materialized='view') }}
 
-with data as (
+{{ dbt_ml_preprocessing.max_abs_scaler( ref('data_max_abs_scaler') ,['col_to_scale'],include_columns=['id_col']) }}
 
-    {{ dbt_ml_preprocessing.max_abs_scaler( ref('data_max_abs_scaler') ,['col_to_scale'],include_columns=['id_col']) }}
-
-)
-
-select * from data

diff --git a/integration_tests/models/sql/test_min_max_scaler.sql b/integration_tests/models/sql/test_min_max_scaler.sql
index ab2cf6a..2d15f81 100644
--- a/integration_tests/models/sql/test_min_max_scaler.sql
+++ b/integration_tests/models/sql/test_min_max_scaler.sql
@@ -1,9 +1,3 @@
 {{ config(materialized='view') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.min_max_scaler( ref('data_max_abs_scaler') ,['col_to_scale']) }}
-
-)
-
-select * from data
+{{ dbt_ml_preprocessing.min_max_scaler( ref('data_max_abs_scaler') ,['col_to_scale']) }}

diff --git a/integration_tests/models/sql/test_min_max_scaler_with_column_selection.sql b/integration_tests/models/sql/test_min_max_scaler_with_column_selection.sql
index 42f8724..a5a0429 100644
--- a/integration_tests/models/sql/test_min_max_scaler_with_column_selection.sql
+++ b/integration_tests/models/sql/test_min_max_scaler_with_column_selection.sql
@@ -1,9 +1,3 @@
 {{ config(materialized='view') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.min_max_scaler( ref('data_max_abs_scaler') ,['col_to_scale'],include_columns=['id_col']) }}
-
-)
-
-select * from data
+{{ dbt_ml_preprocessing.min_max_scaler( ref('data_max_abs_scaler') ,['col_to_scale'],include_columns=['id_col']) }}

diff --git a/integration_tests/models/sql/test_normalizer.sql b/integration_tests/models/sql/test_normalizer.sql
index f9b1d6f..06f6303 100644
--- a/integration_tests/models/sql/test_normalizer.sql
+++ b/integration_tests/models/sql/test_normalizer.sql
@@ -1,9 +1,3 @@
 {{ config(materialized='view') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.normalizer( ref('data_normalizer') ,['col1','col2','col3','col4']) }}
-
-)
-
-select * from data
+{{ dbt_ml_preprocessing.normalizer( ref('data_normalizer') ,['col1','col2','col3','col4']) }}

diff --git a/integration_tests/models/sql/test_one_hot_encoder.sql b/integration_tests/models/sql/test_one_hot_encoder.sql
index 4e9c907..81844ee 100644
--- a/integration_tests/models/sql/test_one_hot_encoder.sql
+++ b/integration_tests/models/sql/test_one_hot_encoder.sql
@@ -1,9 +1,3 @@
 {{ config(materialized='view') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.one_hot_encoder( ref('data_one_hot_encoder') ,'column_to_encode',handle_unknown='ignore') }}
-
-)
-
-select * from data
+{{ dbt_ml_preprocessing.one_hot_encoder( ref('data_one_hot_encoder') ,'column_to_encode',handle_unknown='ignore') }}

diff --git a/integration_tests/models/sql/test_one_hot_encoder_category_selected.sql b/integration_tests/models/sql/test_one_hot_encoder_category_selected.sql
index d41ab81..1ed447a 100644
--- a/integration_tests/models/sql/test_one_hot_encoder_category_selected.sql
+++ b/integration_tests/models/sql/test_one_hot_encoder_category_selected.sql
@@ -1,12 +1,8 @@
 {{ config(materialized='view') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.one_hot_encoder( source_table=ref('data_one_hot_encoder'),
+{{ dbt_ml_preprocessing.one_hot_encoder( source_table=ref('data_one_hot_encoder'),
     source_column='column_to_encode',
     categories=['A','B'],
     handle_unknown='ignore') }}
 
-)
-select * from data
 

diff --git a/integration_tests/models/sql/test_robust_scaler.sql b/integration_tests/models/sql/test_robust_scaler.sql
index a86a451..2dae0b9 100644
--- a/integration_tests/models/sql/test_robust_scaler.sql
+++ b/integration_tests/models/sql/test_robust_scaler.sql
@@ -1,9 +1,3 @@
 {{ config(materialized='view') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.robust_scaler( ref('data_robust_scaler') ,['col_to_scale']) }}
-
-)
-
-select * from data
+{{ dbt_ml_preprocessing.robust_scaler( ref('data_robust_scaler') ,['col_to_scale']) }}

diff --git a/integration_tests/models/sql/test_standard_scaler.sql b/integration_tests/models/sql/test_standard_scaler.sql
index 6af08b7..904fb6f 100644
--- a/integration_tests/models/sql/test_standard_scaler.sql
+++ b/integration_tests/models/sql/test_standard_scaler.sql
@@ -1,14 +1,5 @@
-{{ config(materialized='view') }}
+{{ config(materialized='table') }}
 
-with data as (
-
-    {{ dbt_ml_preprocessing.standard_scaler( ref('data_standard_scaler') ,['col_to_scale_1','col_to_scale_2']) }}
-
-)
-
-select id_col,
-    col_to_scale_1,
-    col_to_scale_2,
-    round(col_to_scale_1_scaled,10) as col_to_scale_1_scaled,
-    round(col_to_scale_2_scaled,10) as col_to_scale_2_scaled
-from data
+-- test model is generated by adapter-specific macro,
+-- because the standard_scaler is not supported by all DBs
+{{ adapter.dispatch('standard_scaler_model_macro')() }}

diff --git a/macros/k_bins_discretizer.sql b/macros/k_bins_discretizer.sql
index 512675d..f5dd592 100644
--- a/macros/k_bins_discretizer.sql
+++ b/macros/k_bins_discretizer.sql
@@ -71,6 +71,29 @@ from
     {{ source_table }} as source_table
 {% endmacro %}
 
+{% macro sqlserver__k_bins_discretizer(source_table,source_columns,include_columns,n_bins,encode,strategy) %}
+select
+{% for column in include_columns %}
+source_table.{{ column }},
+{% endfor %}
+{% for source_column in source_columns %}
+case when
+    floor(
+        cast({{ source_column }} - {{ source_column }}_aggregates.min_value as float)/ cast( {{ source_column }}_aggregates.max_value - {{ source_column }}_aggregates.min_value as float ) * {{ n_bins }}
+    ) > {{ n_bins - 1 }}
+    then {{ n_bins - 1 }}
+    else floor(
+        cast({{ source_column }} - {{ source_column }}_aggregates.min_value as float)/ cast( {{ source_column }}_aggregates.max_value - {{ source_column }}_aggregates.min_value as float ) * {{ n_bins }}
+    )
+    end as {{ source_column }}_binned
+    {% if not loop.last %}, {% endif %}
+{% endfor %}
+from
+    {% for source_column in source_columns %}
+    (select min({{ source_column }}) as min_value, max({{ source_column }}) as max_value from {{ source_table }}) as {{ source_column }}_aggregates,
+    {% endfor %}
+    {{ source_table }} as source_table
+{% endmacro %}
 
 {% macro default__k_bins_discretizer(source_table,source_columns,include_columns,n_bins,encode,strategy) %}
 select

diff --git a/macros/label_encoder.sql b/macros/label_encoder.sql
index 87e034d..799c542 100644
--- a/macros/label_encoder.sql
+++ b/macros/label_encoder.sql
@@ -49,3 +49,11 @@ select
     (select row_num from numbered_distinct_values where distinct_value={{ source_column }}) as {{ source_column }}_encoded
 from {{ source_table }}
 {% endmacro %}
+
+{% macro synapse__label_encoder(source_table,source_column,include_columns) %}
+    {% do return( dbt_ml_preprocessing.redshift__label_encoder(source_table,source_column,include_columns)) %}
+{%- endmacro %}
+
+{% macro sqlserver__label_encoder(source_table,source_column,include_columns) %}
+    {% do return( dbt_ml_preprocessing.redshift__label_encoder(source_table,source_column,include_columns)) %}
+{%- endmacro %}

diff --git a/macros/quantile_transformer.sql b/macros/quantile_transformer.sql
index f1b33ae..d4382db 100644
--- a/macros/quantile_transformer.sql
+++ b/macros/quantile_transformer.sql
@@ -6,7 +6,7 @@
     {{ adapter.dispatch('quantile_transformer',packages=['dbt_ml_preprocessing'])(source_table,source_column,n_quantiles,output_distribution,subsample,include_columns) }}
 {% endmacro %}
 
-{% macro snowflake__quantile_transformer(source_table,source_column,n_quantiles,output_distribution,subsample,include_columns) %}
+{% macro default__quantile_transformer(source_table,source_column,n_quantiles,output_distribution,subsample,include_columns) %}
 with quantile_values as(
     {% for quartile_index in range(n_quantiles) %}
     {% set quartile = quartile_index / (n_quantiles-1) %}
@@ -65,8 +65,7 @@
 coalesce(y1 + ((x-x1)/(x2-x1)) * (y2-y1),0) as {{ source_column }}_transformed
 from linear_interpolation_variables
 {% endmacro %}
 
-{% macro default__quantile_transformer(source_table,source_column,n_quantiles,output_distribution,subsample,include_columns) %}
-
+{% macro redshift__quantile_transformer(source_table,source_column,n_quantiles,output_distribution,subsample,include_columns) %}
 {% set error_message %}
 The `quantile_transformer` macro is only supported on Snowflake and BigQuery at this time. It should work on other DBs, it just requires some rework.
 {% endset %}

diff --git a/macros/quantile_transformer.yml b/macros/quantile_transformer.yml
index 9f80016..80556fa 100644
--- a/macros/quantile_transformer.yml
+++ b/macros/quantile_transformer.yml
@@ -40,10 +40,6 @@ macros:
       docs:
         show: false
 
-    - name: snowflake__quantile_transformer
-      docs:
-        show: false
-
     - name: bigquery__quantile_transformer
       docs:
         show: false

diff --git a/macros/robust_scaler.sql b/macros/robust_scaler.sql
index 3ae443c..f4d7ef9 100644
--- a/macros/robust_scaler.sql
+++ b/macros/robust_scaler.sql
@@ -103,4 +103,30 @@ from
     {% for source_column in source_columns %}
     {{ source_column }}_quartiles,
    {% endfor %}
    {{ source_table }} as source_table
+{% endmacro %}
+
+{% macro sqlserver__robust_scaler(source_table,source_columns,include_columns,with_centering,quantile_range) %}
+with
+{% for source_column in source_columns %}
+    {{ source_column }}_quartiles as(
+        select distinct
+        percentile_cont({{ quantile_range[0] / 100 }}) within group (order by {{ source_column }}) over() as first_quartile,
+        percentile_cont({{ quantile_range[1] / 100 }}) within group (order by {{ source_column }}) over() as third_quartile
+        from {{ source_table }}
+    )
+{% if not loop.last %}, {% endif %}
+{% endfor %}
+select
+{% for column in include_columns %}
+source_table.{{ column }},
+{% endfor %}
+{% for source_column in source_columns %}
+    ({{ source_column }} / ({{ source_column }}_quartiles.third_quartile - {{ source_column }}_quartiles.first_quartile)) as {{ source_column }}_scaled
+    {% if not loop.last %}, {% endif %}
+{% endfor %}
+from
+    {% for source_column in source_columns %}
+    {{ source_column }}_quartiles,
+    {% endfor %}
+    {{ source_table }} as source_table
 {% endmacro %}
\ No newline at end of file
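
Illustrative sketch (not part of the patch above): assuming a hypothetical source table named data_robust_scaler, a single source column col_to_scale, include_columns=['id_col'], and the default quantile_range of (25, 75), the sqlserver__robust_scaler macro renders roughly the following T-SQL:

-- All table and column names below are hypothetical.
with col_to_scale_quartiles as(
    -- the empty over() makes each percentile_cont a constant across the
    -- whole table, and select distinct collapses the CTE to a single row
    select distinct
    percentile_cont(0.25) within group (order by col_to_scale) over() as first_quartile,
    percentile_cont(0.75) within group (order by col_to_scale) over() as third_quartile
    from data_robust_scaler
)
select
source_table.id_col,
    -- scale each value by the interquartile range
    (col_to_scale / (col_to_scale_quartiles.third_quartile - col_to_scale_quartiles.first_quartile)) as col_to_scale_scaled
from
    col_to_scale_quartiles,
    data_robust_scaler as source_table

Because the quartiles CTE yields exactly one row, the comma join behaves as a cross join that attaches the two constants to every source row.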