One hot encoder enhancement: Conoromara/one hot encoder (#4)

Significant changes to the OneHotEncoder macro, hence a new major version:

* Output columns now follow GitLab SQL naming conventions
* Fix case where category values contain whitespace
* Provide flexibility with excluding source table columns
* Support scikit-learn's handle_unknown strategy of 'error'; this is now the default
* Update documentation, bump version

Also added a couple of Redshift fixes to get all tests to pass.

Co-authored-by: James Weakley <jameswillisweakley@gmail.com>
omnata-labs · Mar 1, 2021 · d267f23 · d267f23
1 parent 32bc366
commit d267f23
Show file tree

Hide file tree

Showing 12 changed files with 124 additions and 76 deletions.
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ To use this in your dbt project, create or modify packages.yml to include:
 ```
 packages:
   - package: "omnata-labs/dbt_ml_preprocessing"
-    version: [">=0.7.0"]
+    version: [">=1.0.0"]
 ```
 _(replace the revision number with the latest)_
 

diff --git a/dbt_project.yml b/dbt_project.yml
@@ -1,5 +1,5 @@
 name: 'dbt_ml_preprocessing'
-version: '0.7.0'
+version: '1.0.0'
 
 require-dbt-version: ">=0.15.1"
 

diff --git a/integration_tests/data/sql/data_one_hot_encoder_category_selected_expected.csv b/integration_tests/data/sql/data_one_hot_encoder_category_selected_expected.csv
@@ -1,4 +1,4 @@
-id_col,column_to_encode,column_to_encode_A,column_to_encode_B
+id_col,column_to_encode,is_column_to_encode_A,is_column_to_encode_B
 1,A,1,0
 2,B,0,1
 3,C,0,0

diff --git a/integration_tests/data/sql/data_one_hot_encoder_expected.csv b/integration_tests/data/sql/data_one_hot_encoder_expected.csv
@@ -1,4 +1,4 @@
-id_col,column_to_encode,column_to_encode_A,column_to_encode_B,column_to_encode_C,column_to_encode_D
+id_col,column_to_encode,is_column_to_encode_A,is_column_to_encode_B,is_column_to_encode_C,is_column_to_encode_D
 1,A,1,0,0,0
 2,B,0,1,0,0
 3,C,0,0,1,0

diff --git a/integration_tests/data/sql/generate_one_hot_encoder_expected.py b/integration_tests/data/sql/generate_one_hot_encoder_expected.py
@@ -7,8 +7,8 @@
 
 transformed_columns=transformer.transform(second_column).toarray()
 print(transformed_columns)
-test_dataset_df['column_to_encode_A']=transformed_columns[:, 0].astype(int)
-test_dataset_df['column_to_encode_B']=transformed_columns[:, 1].astype(int)
-test_dataset_df['column_to_encode_C']=transformed_columns[:, 2].astype(int)
-test_dataset_df['column_to_encode_D']=transformed_columns[:, 3].astype(int)
+test_dataset_df['is_column_to_encode_A']=transformed_columns[:, 0].astype(int)
+test_dataset_df['is_column_to_encode_B']=transformed_columns[:, 1].astype(int)
+test_dataset_df['is_column_to_encode_C']=transformed_columns[:, 2].astype(int)
+test_dataset_df['is_column_to_encode_D']=transformed_columns[:, 3].astype(int)
 test_dataset_df.to_csv("data_one_hot_encoder_expected.csv",index=False)
diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml
@@ -71,15 +71,15 @@ seeds:
 
       data_one_hot_encoder_expected:
         +column_types:
-          column_to_encode_A: boolean
-          column_to_encode_B: boolean
-          column_to_encode_C: boolean
-          column_to_encode_D: boolean
+          is_column_to_encode_A: boolean
+          is_column_to_encode_B: boolean
+          is_column_to_encode_C: boolean
+          is_column_to_encode_D: boolean
 
       data_one_hot_encoder_category_selected_expected:
         +column_types:
-          column_to_encode_A: boolean
-          column_to_encode_B: boolean
+          is_column_to_encode_A: boolean
+          is_column_to_encode_B: boolean
 
       data_quantile_transformer_expected:
         +column_types:

diff --git a/integration_tests/models/sql/test_one_hot_encoder.sql b/integration_tests/models/sql/test_one_hot_encoder.sql
@@ -2,7 +2,7 @@
 
 with data as (
 
-    {{ dbt_ml_preprocessing.one_hot_encoder( ref('data_one_hot_encoder') ,'column_to_encode') }}
+    {{ dbt_ml_preprocessing.one_hot_encoder( ref('data_one_hot_encoder') ,'column_to_encode',handle_unknown='ignore') }}
 
 )
 

diff --git a/integration_tests/models/sql/test_one_hot_encoder_category_selected.sql b/integration_tests/models/sql/test_one_hot_encoder_category_selected.sql
@@ -4,7 +4,8 @@ with data as (
 
     {{ dbt_ml_preprocessing.one_hot_encoder( source_table=ref('data_one_hot_encoder'),
                                             source_column='column_to_encode',
-                                            categories=['A','B']) }}
+                                            categories=['A','B'],
+                                            handle_unknown='ignore') }}
 
 )
 

diff --git a/macros/k_bins_discretizer.sql b/macros/k_bins_discretizer.sql
@@ -79,8 +79,8 @@ source_table.{{ column }},
 {% endfor %}
 {% for source_column in source_columns %}
 least(
-      ceil(
-          ({{ source_column }} - {{ source_column }}_aggregates.min_value )/ (( {{ source_column }}_aggregates.max_value - {{ source_column }}_aggregates.min_value ) / {{ n_bins }} )
+      floor(
+          cast({{ source_column }} - {{ source_column }}_aggregates.min_value as decimal)/ cast( {{ source_column }}_aggregates.max_value - {{ source_column }}_aggregates.min_value as decimal ) * {{ n_bins }} 
       ),
       {{ n_bins - 1 }}
   ) as {{ source_column }}_binned

diff --git a/macros/one_hot_encoder.sql b/macros/one_hot_encoder.sql
@@ -1,56 +1,102 @@
-{% macro one_hot_encoder(source_table,source_column,categories='auto',handle_unknown='ignore',include_columns='*') %}
-{%- if categories=='auto' -%}
-    {% set category_values_query %}
-    select distinct {{ source_column }} from {{ source_table }}
-    order by 1
-    {% endset %}
-    {% set results = run_query(category_values_query) %}
-    {% if execute %}
-        {# Return the first column #}
-        {% set category_values = results.columns[0].values() %}
-    {% else %}
-        {% set category_values = [] %}
-    {% endif %}
-{% elif categories is not iterable or categories is string or categories is mapping %}
-    {% set error_message %}
-The `categories` parameter must contain a list of category values.
-    {% endset %}
+{% macro one_hot_encoder(source_table, source_column, categories='auto', handle_unknown='error',include_columns='*', exclude_columns=none) %}
+    {#- Validates the arguments, resolves the category list (queried from source_table when categories='auto', otherwise the list passed in), then dispatches to the adapter-specific implementation. -#}
+    {%- if categories=='auto' -%}
+        {% set category_values_query %}
+            select distinct
+                {{ source_column }}
+            from
+                {{ source_table }}
+            order by 1
+        {% endset %}
+        {% set results = run_query(category_values_query) %}
+        {% if execute %}
+            {# Return the first column #}
+            {% set category_values = results.columns[0].values() %}
+        {% else %}
+            {% set category_values = [] %}
+        {% endif %}
+    {% elif categories is not iterable or categories is string or categories is mapping %}
+        {% set error_message %}
+    The `categories` parameter must contain a list of category values.
+        {% endset %}
+        {%- do exceptions.raise_compiler_error(error_message) -%}
+    {%- else -%}
+        {% set category_values = categories %}
+    {%- endif -%}
+    {#- handle_unknown accepts only 'ignore' or 'error'; anything else is a compile error. -#}
+    {%- if handle_unknown!='ignore' and handle_unknown!='error' -%}
+        {% set error_message %}
+    The 'handle_unknown' parameter requires a value of either 'ignore' (when unknown value occurs, all output columns are false) or 'error' (when unknown value occurs, raise an error).
+        {% endset %}
+        {%- do exceptions.raise_compiler_error(error_message) -%}
+    {%- endif -%}
+    {#- include_columns and exclude_columns are mutually exclusive. -#}
+    {%- if include_columns!='*' and exclude_columns is not none -%}
+        {% set error_message %}
+    If the 'exclude_columns' parameter is set, providing 'include_columns' is invalid and must be left at its default value.
+        {% endset %}
+        {%- do exceptions.raise_compiler_error(error_message) -%}
+    {%- endif -%}
+    {#- exclude_columns, when given, must be a list rather than a string or a mapping. -#}
+    {%- if exclude_columns is not none and (exclude_columns is not iterable or exclude_columns is string or exclude_columns is mapping) -%}
+        {% set error_message %}
+    The 'exclude_columns' parameter value must contain a list of column names.
+        {% endset %}
     {%- do exceptions.raise_compiler_error(error_message) -%}
-{%- else -%}
-    {% set category_values = categories %}
-{%- endif -%}
-{%- if handle_unknown!='ignore' -%}
-    {% set error_message %}
-The `one_hot_encoder` macro only supports an 'handle_unknown' value of 'ignore' at this time.
-    {% endset %}
+    {%- endif -%}
+    {#- include_columns is either the string '*' or a list of column names. -#}
+    {%- if include_columns!='*' and (include_columns is not iterable or include_columns is string or include_columns is mapping) -%}
+        {% set error_message %}
+    The 'include_columns' parameter value must contain either the string '*' (for all columns in source), or a list of column names.
+        {% endset %}
     {%- do exceptions.raise_compiler_error(error_message) -%}
-{%- endif -%}
-{{ adapter.dispatch('one_hot_encoder',packages=['dbt_ml_preprocessing'])(source_table,source_column,category_values,handle_unknown,include_columns) }}
-{%- endmacro %}
+    {%- endif -%}
 
-{% macro snowflake__one_hot_encoder(source_table,source_column,category_values,handle_unknown,include_columns) %}
-select 
-{% for column in include_columns %}
-{{ source_table }}.{{ column }},
-{% endfor %}
-{% for category in category_values %}
-iff({{source_column}}='{{category}}',true,false) as {{source_column}}_{{category}}
-{% if not loop.last %}, {% endif %}
-{% endfor %}
-from {{ source_table }}
+    {{ adapter.dispatch('one_hot_encoder',packages=['dbt_ml_preprocessing'])(source_table, source_column, category_values, handle_unknown, include_columns, exclude_columns) }}
 {%- endmacro %}
 
-{% macro default__one_hot_encoder(source_table,source_column,category_values,handle_unknown,include_columns) %}
-select 
-{% for column in include_columns %}
-{{ column }},
-{% endfor %}
-{% for category in category_values %}
-case {{source_column}}
-    when '{{category}}' then true
-    else false
-    end as {{source_column}}_{{category}}
-{% if not loop.last %}, {% endif %}
-{% endfor %}
-from {{ source_table }}
+{% macro default__one_hot_encoder(source_table, source_column, category_values, handle_unknown, include_columns, exclude_columns) %}
+    {# Selects the requested source columns (all, the include_columns list, or all minus exclude_columns) plus one boolean is_<source_column>_<category> column per category; spaces in category values become underscores in the alias. #}
+    {# NOTE(review): category values are interpolated unescaped, and with handle_unknown='error' an unknown value hits a cast of a message string to boolean - presumably to make the query fail; confirm per adapter. #}
+    {% set columns = adapter.get_columns_in_relation( source_table ) %}
+    {# Lower-case each excluded name individually: applying `lower` to the list itself yields one string, which turns `in` into a substring match. #}
+    {% set excluded_names = (exclude_columns or []) | map('lower') | list %}
+    {% set known_values_sql = category_values | join("','") %}
+    with binary_output as (
+    select
+        {%- if include_columns=='*' and exclude_columns is none -%}
+            {% for column in columns %}
+                {{ column.name }},
+            {%- endfor -%}
+        {%- elif include_columns !='*'-%}
+            {% for column in include_columns %}
+                {{ source_table }}.{{ column }},
+            {%- endfor -%}
+        {%- else -%}
+            {% for column in columns %}
+            {%- if column.name | lower not in excluded_names %}
+                {{ column.name }},
+            {%- endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+        {% for category in category_values %}
+            {% set no_whitespace_column_name = category | replace( " ", "_") -%}
+                {%- if handle_unknown=='ignore' %}
+                    case 
+                        when {{ source_column }} = '{{ category }}' then true 
+                        else false
+                    end as is_{{ source_column }}_{{ no_whitespace_column_name }}
+                {%- elif handle_unknown=='error' %}
+                    case 
+                        when {{ source_column }} = '{{ category }}' then true 
+                        when {{ source_column }} in ('{{ known_values_sql }}') then false
+                        else cast('Error: unknown value found and handle_unknown parameter was "error"' as boolean)
+                    end as is_{{ source_column }}_{{ no_whitespace_column_name }}
+                {% endif %}
+            {%- if not loop.last %},{% endif -%}
+        {% endfor %}
+    from {{ source_table }}
+    )
+
+    select * from binary_output
 {%- endmacro %}
diff --git a/macros/one_hot_encoder.yml b/macros/one_hot_encoder.yml
@@ -5,7 +5,7 @@ macros:
     description: |
       Encode categorical features as a one-hot numeric array. See scikit-learn's [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder) for full documentation.
 
-      Will append a new boolean column for every category present in the data with the name &lt;source column&gt;_&lt;category value&gt;.
+      Will append a new boolean column for every category present in the data with the name is_&lt;source column&gt;_&lt;category value&gt;.
 
       Example usage:
       #### **`models\customer_features.yml:`**
@@ -15,7 +15,9 @@ macros:
       {{ '{{' }} dbt_ml_preprocessing.one_hot_encoder( ref('customer') ,'gender') {{ '}}' }}
 
       ```
-      Will produce a model named customer_features, with a new column named ```gender_encoded``` containing the encoded values.
+      If the column contained values 'male' and 'female', it will produce a model named customer_features with two new boolean columns named ```is_gender_male``` and ```is_gender_female```.
+      
+      Any spaces in the category values will be replaced with underscores, for ease of querying.
     arguments:
       - name: source_table
         type: string
@@ -26,17 +28,16 @@ macros:
       - name: include_columns
         type: string
         description: Other columns from the source table to be included in the model (defaults to '*' and brings all columns across)
+      - name: exclude_columns
+        type: string
+        description: A list of columns from the source table to be excluded from the model. Cannot be used in conjunction with 'include_columns'
       - name: categories
         type: string
         description: The categories of each feature determined during fitting. Defaults to 'auto', which will encode all values.
       - name: handle_unknown
         type: string
-        description: Whether to raise an error or ignore if an unknown categorical feature is present during transform. Only supports the default value of 'ignore' at this time.
+        description: Whether to raise an error or ignore if an unknown categorical feature is present during transform, defaults to 'error'. If 'ignore' is set and an unknown value is encountered, all output columns will be false.
 
   - name: default__one_hot_encoder
     docs:
       show: false
-
-  - name: snowflake__one_hot_encoder
-    docs:
-      show: false
diff --git a/macros/robust_scaler.sql b/macros/robust_scaler.sql
@@ -78,7 +78,7 @@ from
     {{ source_table }} as source_table
 {% endmacro %}
 
-{% macro redshift__robust_scaler(source_table,source_column,include_columns,with_centering,quantile_range) %}
+{% macro redshift__robust_scaler(source_table,source_columns,include_columns,with_centering,quantile_range) %}
 with 
 {% for source_column in source_columns %}
     {{ source_column }}_quartiles as(