Skip to content

Commit

Permalink
Merge branch 'transformer' of https://github.com/basf/mamba-tabular into transformer
Browse files Browse the repository at this point in the history
  • Loading branch information
thielmaf committed May 28, 2024
2 parents 3c17fe1 + 0b222a4 commit f1bee61
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 3 deletions.
83 changes: 83 additions & 0 deletions mambular/utils/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
from .prepro_utils import OneHotFromOrdinal, CustomBinner, ContinuousOrdinalEncoder
from .prepro_utils import OneHotFromOrdinal, CustomBinner, ContinuousOrdinalEncoder
from sklearn.preprocessing import (
StandardScaler,
KBinsDiscretizer,
Expand All @@ -12,6 +13,7 @@
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from .ple_encoding import PLE
from .ple_encoding import PLE


class Preprocessor:
Expand All @@ -37,6 +39,10 @@ class Preprocessor:
'quantile', or other sklearn-compatible strategies.
task (str): Indicates the type of machine learning task ('regression' or 'classification'). This can
influence certain preprocessing behaviors, especially when using decision tree-based binning.
binning_strategy (str): Defines the strategy for binning numerical features. Options include 'uniform',
'quantile', or other sklearn-compatible strategies.
task (str): Indicates the type of machine learning task ('regression' or 'classification'). This can
influence certain preprocessing behaviors, especially when using decision tree-based binning.
Attributes:
column_transformer (ColumnTransformer): An instance of sklearn's ColumnTransformer that holds the
Expand All @@ -52,9 +58,21 @@ def __init__(
use_decision_tree_bins=False,
binning_strategy="uniform",
task="regression",
task="regression",
):
self.n_bins = n_bins
self.numerical_preprocessing = numerical_preprocessing.lower()
if self.numerical_preprocessing not in [
"ple",
"binning",
"one_hot",
"standardization",
"normalization",
]:
raise ValueError(
"Invalid numerical_preprocessing value. Supported values are 'ple', 'binning', 'one_hot', 'standardization', and 'normalization'."
)
self.numerical_preprocessing = numerical_preprocessing.lower()
if self.numerical_preprocessing not in [
"ple",
"binning",
Expand All @@ -70,6 +88,7 @@ def __init__(
self.fitted = False
self.binning_strategy = binning_strategy
self.task = task
self.task = task

def set_params(self, **params):
for key, value in params.items():
Expand Down Expand Up @@ -179,6 +198,12 @@ def fit(self, X, y=None):
("ple", PLE(n_bins=self.n_bins, task=self.task))
)

elif self.numerical_preprocessing == "ple":
numeric_transformer_steps.append(("normalizer", MinMaxScaler()))
numeric_transformer_steps.append(
("ple", PLE(n_bins=self.n_bins, task=self.task))
)

numeric_transformer = Pipeline(numeric_transformer_steps)

transformers.append((f"num_{feature}", numeric_transformer, [feature]))
Expand Down Expand Up @@ -237,11 +262,17 @@ def transform(self, X):
Transforms the input data using the preconfigured column transformer and converts the output into a dictionary
format with keys corresponding to transformed feature names and values as arrays of transformed data.
This method converts the sparse or dense matrix returned by the column transformer into a more accessible
dictionary format, where each key-value pair represents a feature and its transformed data.
Transforms the input data using the preconfigured column transformer and converts the output into a dictionary
format with keys corresponding to transformed feature names and values as arrays of transformed data.
This method converts the sparse or dense matrix returned by the column transformer into a more accessible
dictionary format, where each key-value pair represents a feature and its transformed data.
Parameters:
X (DataFrame): The input data to be transformed.
X (DataFrame): The input data to be transformed.
Returns:
dict: A dictionary where keys are the names of the features (as per the transformations defined in the
Expand All @@ -253,6 +284,32 @@ def transform(self, X):
transformed_dict = self._split_transformed_output(X, transformed_X)
return transformed_dict

def _split_transformed_output(self, X, transformed_X):
"""
Splits the transformed data array into a dictionary where keys correspond to the original column names or
feature groups and values are the transformed data for those columns.
This helper method is utilized within `transform` to segregate the transformed data based on the
specification in the column transformer, assigning each transformed section to its corresponding feature name.
Parameters:
X (DataFrame): The original input data, used for determining shapes and transformations.
transformed_X (numpy array): The transformed data as a numpy array, outputted by the column transformer.
Returns:
dict: A dictionary mapping each transformation's name to its respective numpy array of transformed data.
The type of each array (int or float) is determined based on the type of transformation applied.
"""
start = 0
dict: A dictionary where keys are the names of the features (as per the transformations defined in the
column transformer) and the values are numpy arrays of the transformed data.
"""
transformed_X = self.column_transformer.transform(X)
# Now let's convert this into a dictionary of arrays, one per column
transformed_dict = self._split_transformed_output(X, transformed_X)
return transformed_dict
def _split_transformed_output(self, X, transformed_X):
"""
Splits the transformed data array into a dictionary where keys correspond to the original column names or
Expand Down Expand Up @@ -281,6 +338,16 @@ def _split_transformed_output(self, X, transformed_X):
dtype = int if "cat" in name else float
transformed_dict[name] = transformed_X[:, start:end].astype(dtype)
start = end
for (
name,
transformer,
columns,
) in self.column_transformer.transformers_: # skip 'remainder'
if transformer != "drop":
end = start + transformer.transform(X[[columns[0]]]).shape[1]
dtype = int if "cat" in name else float
transformed_dict[name] = transformed_X[:, start:end].astype(dtype)
start = end
return transformed_dict
Expand Down Expand Up @@ -308,6 +375,16 @@ def get_feature_info(self):
This method should only be called after the preprocessor has been fitted, as it relies on the structure and
configuration of the `column_transformer` attribute.

Raises:
RuntimeError: If the `column_transformer` is not yet fitted, indicating that the preprocessor must be
fitted before invoking this method.
Retrieves information about how features are encoded within the model's preprocessor.
This method identifies the type of encoding applied to each feature, categorizing them into binned or ordinal
encodings and other types of encodings (e.g., one-hot encoding after discretization).

This method should only be called after the preprocessor has been fitted, as it relies on the structure and
configuration of the `column_transformer` attribute.

Raises:
RuntimeError: If the `column_transformer` is not yet fitted, indicating that the preprocessor must be
fitted before invoking this method.
Expand All @@ -318,6 +395,11 @@ def get_feature_info(self):
processed using discretization or ordinal encoding.
- The second dictionary includes feature names with other encoding details, such as the dimension of
features after encoding transformations (e.g., one-hot encoding dimensions).
tuple of (dict, dict):
- The first dictionary maps feature names to their respective number of bins or categories if they are
processed using discretization or ordinal encoding.
- The second dictionary includes feature names with other encoding details, such as the dimension of
features after encoding transformations (e.g., one-hot encoding dimensions).
"""
binned_or_ordinal_info = {}
other_encoding_info = {}
Expand Down Expand Up @@ -373,6 +455,7 @@ def get_feature_info(self):
other_encoding_info[feature_name] = transformed_feature.shape[1]
print(
f"Feature: {feature_name} (Other Encoding), Encoded feature dimension: {transformed_feature.shape[1]}"
f"Feature: {feature_name} (Other Encoding), Encoded feature dimension: {transformed_feature.shape[1]}"
)

print("-" * 50)
Expand Down
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
include_package_data=True,
project_urls={"Homepage:": HOMEPAGE, "Documentation": DOCS},
url=HOMEPAGE,
python_requires=">=3.6, <=3.12.3",
install_requires=read_requirements(),
)

0 comments on commit f1bee61

Please sign in to comment.