From 0b222a4e70e21a5ebff8a07d68ec674effd77a5f Mon Sep 17 00:00:00 2001 From: thielmaf Date: Tue, 28 May 2024 07:29:03 +0000 Subject: [PATCH 01/21] test new preprocessing (ple) --- mambular/utils/mlp_utils.py | 245 +++++++++++++++++++++++++++ mambular/utils/ple_encoding.py | 156 ++++++++++++++++++ mambular/utils/prepro_utils.py | 167 +++++++++++++++++++ mambular/utils/preprocessor.py | 292 +++++++++------------------------ setup.py | 2 +- tests/test_preprocessor.py | 9 +- 6 files changed, 658 insertions(+), 213 deletions(-) create mode 100644 mambular/utils/mlp_utils.py create mode 100644 mambular/utils/ple_encoding.py create mode 100644 mambular/utils/prepro_utils.py diff --git a/mambular/utils/mlp_utils.py b/mambular/utils/mlp_utils.py new file mode 100644 index 0000000..78dc209 --- /dev/null +++ b/mambular/utils/mlp_utils.py @@ -0,0 +1,245 @@ +import torch +import torch.nn as nn + + +class Linear_skip_block(nn.Module): + """ + A neural network block that includes a linear layer, an activation function, a dropout layer, and optionally a + skip connection and batch normalization. The skip connection is added if the input and output feature sizes are equal. + + Parameters + ---------- + n_input : int + The number of input features. + n_output : int + The number of output features. + dropout_rate : float + The rate of dropout to apply for regularization. + activation_fn : torch.nn.modules.activation, optional + The activation function to use after the linear layer. Default is nn.LeakyReLU(). + use_batch_norm : bool, optional + Whether to apply batch normalization after the activation function. Default is False. + + Attributes + ---------- + fc : torch.nn.Linear + The linear transformation layer. + act : torch.nn.Module + The activation function. + drop : torch.nn.Dropout + The dropout layer. + use_batch_norm : bool + Indicator of whether batch normalization is used. + batch_norm : torch.nn.BatchNorm1d, optional + The batch normalization layer, instantiated if use_batch_norm is True. + use_skip : bool + Indicator of whether a skip connection is used. + """ + + def __init__( + self, + n_input, + n_output, + dropout_rate, + activation_fn=nn.LeakyReLU(), + use_batch_norm=False, + ): + super(Linear_skip_block, self).__init__() + + self.fc = nn.Linear(n_input, n_output) + self.act = activation_fn + self.drop = nn.Dropout(dropout_rate) + self.use_batch_norm = use_batch_norm + self.use_skip = ( + n_input == n_output + ) # Only use skip connection if input and output sizes are equal + + if use_batch_norm: + self.batch_norm = nn.BatchNorm1d(n_output) # Initialize batch normalization + + def forward(self, x): + """ + Defines the forward pass of the Linear_block. + + Parameters + ---------- + x : Tensor + The input tensor to the block. + + Returns + ------- + Tensor + The output tensor after processing through the linear layer, activation function, dropout, + and optional batch normalization. + """ + x0 = x # Save input for possible skip connection + x = self.fc(x) + x = self.act(x) + + if self.use_batch_norm: + x = self.batch_norm(x) # Apply batch normalization after activation + + if self.use_skip: + x = x + x0 # Add skip connection if applicable + + x = self.drop(x) # Apply dropout + return x + + +class Linear_block(nn.Module): + """ + A neural network block that includes a linear layer, an activation function, a dropout layer, and optionally batch normalization. + + Parameters + ---------- + n_input : int + The number of input features. + n_output : int + The number of output features. 
+ dropout_rate : float + The rate of dropout to apply. + activation_fn : torch.nn.modules.activation, optional + The activation function to use after the linear layer. Default is nn.LeakyReLU(). + batch_norm : bool, optional + Whether to include batch normalization after the activation function. Default is False. + + Attributes + ---------- + block : torch.nn.Sequential + A sequential container holding the linear layer, activation function, dropout, and optionally batch normalization. + """ + + def __init__( + self, + n_input, + n_output, + dropout_rate, + activation_fn=nn.LeakyReLU(), + batch_norm=False, + ): + super(Linear_block, self).__init__() + + # Initialize modules + modules = [ + nn.Linear(n_input, n_output), + activation_fn, + nn.Dropout(dropout_rate), + ] + + # Optionally add batch normalization + if batch_norm: + modules.append(nn.BatchNorm1d(n_output)) + + # Create the sequential model + self.block = nn.Sequential(*modules) + + def forward(self, x): + """ + Defines the forward pass of the Linear_block. + + Parameters + ---------- + x : Tensor + The input tensor to the block. + + Returns + ------- + Tensor + The output tensor after processing through the linear layer, activation function, dropout, + and optional batch normalization. + """ + # Pass the input through the block + return self.block(x) + + +class MLP(nn.Module): + """ + A multi-layer perceptron (MLP) for regression tasks, configurable with optional skip connections and batch normalization. + + Parameters + ---------- + n_input_units : int + The number of units in the input layer. + hidden_units_list : list of int + A list specifying the number of units in each hidden layer. + n_output_units : int + The number of units in the output layer. + dropout_rate : float + The dropout rate used across the MLP. + use_skip_layers : bool, optional + Whether to use skip connections in layers where input and output sizes match. Default is False. + activation_fn : torch.nn.modules.activation, optional + The activation function used across the layers. Default is nn.LeakyReLU(). + use_batch_norm : bool, optional + Whether to apply batch normalization in each layer. Default is False. + + Attributes + ---------- + hidden_layers : torch.nn.Sequential + Sequential container of layers comprising the MLP's hidden layers. + linear_final : torch.nn.Linear + The final linear layer of the MLP. + """ + + def __init__( + self, + n_input_units, + hidden_units_list=[64, 32, 32], + n_output_units: int = 1, + dropout_rate: float = 0.1, + use_skip_layers: bool = False, + activation_fn=nn.LeakyReLU(), + use_batch_norm: bool = False, + ): + super(MLP, self).__init__() + self.n_input_units = n_input_units + self.hidden_units_list = hidden_units_list + self.dropout_rate = dropout_rate + self.n_output_units = n_output_units + + layers = [] + input_units = n_input_units + + for n_hidden_units in hidden_units_list: + if use_skip_layers and input_units == n_hidden_units: + layers.append( + Linear_skip_block( + input_units, + n_hidden_units, + dropout_rate, + activation_fn, + use_batch_norm, + ) + ) + else: + layers.append( + Linear_block( + input_units, + n_hidden_units, + dropout_rate, + activation_fn, + use_batch_norm, + ) + ) + input_units = n_hidden_units # Update input_units for the next layer + + self.hidden_layers = nn.Sequential(*layers) + self.linear_final = nn.Linear(input_units, n_output_units) # Final layer + + def forward(self, x): + """ + Defines the forward pass of the MLP. 
+ + Parameters + ---------- + x : Tensor + The input tensor to the MLP. + + Returns + ------- + Tensor + The output predictions of the model for regression tasks. + """ + x = self.hidden_layers(x) + x = self.linear_final(x) + return x diff --git a/mambular/utils/ple_encoding.py b/mambular/utils/ple_encoding.py new file mode 100644 index 0000000..972b36b --- /dev/null +++ b/mambular/utils/ple_encoding.py @@ -0,0 +1,156 @@ +import numpy as np +from tqdm import tqdm +import pandas as pd +import bisect +import re +from sklearn.tree import _tree +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +import pandas as pd +import numpy as np +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier + + +def tree_to_code(tree, feature_names): + """ + Convert a scikit-learn decision tree into a list of conditions. + + Args: + tree (sklearn.tree.DecisionTreeRegressor or sklearn.tree.DecisionTreeClassifier): + The decision tree model to be converted. + feature_names (list of str): The names of the features used in the tree. + Y (array-like): The target values associated with the tree. + + Returns: + list of str: A list of conditions representing the decision tree paths. + + Example: + # Convert a decision tree into a list of conditions + tree_conditions = tree_to_code(tree_model, feature_names, target_values) + """ + + tree_ = tree.tree_ + feature_name = [ + feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" + for i in tree_.feature + ] + + pathto = dict() + my_list = [] + + global k + k = 0 + + def recurse(node, depth, parent): + global k + indent = " " * depth + + if tree_.feature[node] != _tree.TREE_UNDEFINED: + # name = df_name + "[" + "'" + feature_name[node]+ "'" + "]" + name = feature_name[node] + threshold = tree_.threshold[node] + s = "{} <= {} ".format(name, threshold, node) + if node == 0: + pathto[node] = "(" + s + ")" + else: + pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")" + + recurse(tree_.children_left[node], depth + 1, node) + s = "{} > {}".format(name, threshold) + if node == 0: + pathto[node] = s + else: + pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")" + recurse(tree_.children_right[node], depth + 1, node) + else: + k = k + 1 + my_list.append(pathto[parent]) + # print(k,')',pathto[parent], tree_.value[node]) + + recurse(0, 1, 0) + + return my_list + + +class PLE(BaseEstimator, TransformerMixin): + def __init__( + self, n_bins=20, tree_params={}, task="regression", conditions=None, **kwargs + ): + super(PLE, self).__init__(**kwargs) + + self.task = task + self.tree_params = tree_params + self.n_bins = n_bins + self.conditions = conditions + self.pattern = ( + r"-?\d+\.?\d*[eE]?[+-]?\d*" # This pattern matches integers and floats + ) + + def fit(self, feature, target): + if self.task == "regression": + dt = DecisionTreeRegressor(max_leaf_nodes=self.n_bins) + elif self.task == "classification": + dt = DecisionTreeClassifier(max_leaf_nodes=self.n_bins) + else: + raise ValueError("This task is not supported") + + dt.fit(feature, target) + + self.conditions = tree_to_code(dt, ["feature"]) + return self + + def transform(self, feature): + if feature.shape == (feature.shape[0], 1): + feature = np.squeeze(feature, axis=1) + else: + feature = feature + result_list = [] + for idx, cond in enumerate(self.conditions): + result_list.append(eval(cond) * (idx + 1)) + + encoded_feature = np.expand_dims(np.sum(np.stack(result_list).T, axis=1), 1) + + 
encoded_feature = np.array(encoded_feature - 1, dtype=np.int64) + + # Initialize an empty list to store the extracted numbers + locations = [] + # Iterate through the strings and extract numbers + for string in self.conditions: + matches = re.findall(self.pattern, string) + locations.extend(matches) + + locations = [float(number) for number in locations] + locations = list(set(locations)) + locations = np.sort(locations) + + ple_encoded_feature = np.zeros((len(feature), locations.shape[0] + 1)) + if locations[-1] > np.max(feature): + locations[-1] = np.max(feature) + + for idx in range(len(encoded_feature)): + if feature[idx] >= locations[-1]: + ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx] + ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1 + elif feature[idx] <= locations[0]: + ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx] + + else: + ple_encoded_feature[idx][encoded_feature[idx]] = ( + feature[idx] - locations[(encoded_feature[idx] - 1)[0]] + ) / ( + locations[(encoded_feature[idx])[0]] + - locations[(encoded_feature[idx] - 1)[0]] + ) + + ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1 + + if ple_encoded_feature.shape[1] == 1: + return np.zeros([len(feature), self.n_bins]) + + else: + return np.array(ple_encoded_feature, dtype=np.float32) + + def get_feature_names_out(self, input_features=None): + if input_features is None: + raise ValueError("input_features must be specified") + return input_features diff --git a/mambular/utils/prepro_utils.py b/mambular/utils/prepro_utils.py new file mode 100644 index 0000000..4bb9fa7 --- /dev/null +++ b/mambular/utils/prepro_utils.py @@ -0,0 +1,167 @@ +import pandas as pd +import numpy as np +from sklearn.base import TransformerMixin, BaseEstimator + + +class CustomBinner(TransformerMixin): + def __init__(self, bins): + # bins can be a scalar (number of bins) or array-like (bin edges) + self.bins = bins + + def fit(self, X, y=None): + # Fit doesn't need to do anything as we are directly using provided bins + return self + + def transform(self, X): + if isinstance(self.bins, int): + # Calculate equal width bins based on the range of the data and number of bins + _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True) + else: + # Use predefined bins + bins = self.bins + + # Apply the bins to the data + binned_data = pd.cut( + X.squeeze(), + bins=np.sort(np.unique(bins)), + labels=False, + include_lowest=True, + ) + print(binned_data) + return np.expand_dims(np.array(binned_data), 1) + + +class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin): + """ + This encoder converts categorical features into continuous integer values. Each unique category within a feature + is assigned a unique integer based on its order of appearance in the dataset. This transformation is useful for + models that can only handle continuous data. + + Attributes: + mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers + for a single feature. + + Methods: + fit(X, y=None): Learns the mapping from original categories to integers. + transform(X): Applies the learned mapping to the data. + get_feature_names_out(input_features=None): Returns the input features after transformation. + """ + + def fit(self, X, y=None): + """ + Learns the mapping from original categories to integers for each feature. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit. + y (ignored): Not used, present for API consistency by convention. 
+ + Returns: + self: Returns the instance itself. + """ + # Fit should determine the mapping from original categories to sequential integers starting from 0 + self.mapping_ = [ + {category: i for i, category in enumerate(np.unique(col))} for col in X.T + ] + return self + + def transform(self, X): + """ + Transforms the categories in X to their corresponding integer values based on the learned mapping. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform. + + Returns: + X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values. + """ + # Transform the categories to their mapped integer values + X_transformed = np.array( + [ + [self.mapping_[col].get(value, -1) for col, value in enumerate(row)] + for row in X + ] + ) + return X_transformed + + def get_feature_names_out(self, input_features=None): + """ + Returns the names of the transformed features. + + Parameters: + input_features (list of str): The names of the input features. + + Returns: + input_features (array of shape (n_features,)): The names of the output features after transformation. + """ + if input_features is None: + raise ValueError("input_features must be specified") + return input_features + + +class OneHotFromOrdinal(TransformerMixin, BaseEstimator): + """ + A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful + in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required + for model training. + + Attributes: + max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature, + determining the size of the one-hot encoded array for that feature. + + Methods: + fit(X, y=None): Learns the maximum bin index for each feature. + transform(X): Converts ordinal-encoded features into one-hot format. + get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding. + """ + + def fit(self, X, y=None): + """ + Learns the maximum bin index for each feature from the data. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features. + y (ignored): Not used, present for API consistency by convention. + + Returns: + self: Returns the instance itself. + """ + self.max_bins_ = ( + np.max(X, axis=0).astype(int) + 1 + ) # Find the maximum bin index for each feature + return self + + def transform(self, X): + """ + Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features. + + Returns: + X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features. + """ + # Initialize an empty list to hold the one-hot encoded arrays + one_hot_encoded = [] + for i, max_bins in enumerate(self.max_bins_): + # Convert each feature to one-hot using its max_bins + feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)] + one_hot_encoded.append(feature_one_hot) + # Concatenate the one-hot encoded features horizontally + return np.hstack(one_hot_encoded) + + def get_feature_names_out(self, input_features=None): + """ + Generates feature names for the one-hot encoded features based on the input feature names and the number of bins. 
+ + Parameters: + input_features (list of str): The names of the input features that were ordinal-encoded. + + Returns: + feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features. + """ + feature_names = [] + for i, max_bins in enumerate(self.max_bins_): + feature_names.extend( + [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] + ) + return np.array(feature_names) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index 7861ba0..082bbec 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from sklearn.base import TransformerMixin, BaseEstimator +from .prepro_utils import OneHotFromOrdinal, CustomBinner, ContinuousOrdinalEncoder from sklearn.preprocessing import ( StandardScaler, KBinsDiscretizer, @@ -10,171 +10,7 @@ from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer -from sklearn.exceptions import NotFittedError - - -class CustomBinner(TransformerMixin): - def __init__(self, bins): - # bins can be a scalar (number of bins) or array-like (bin edges) - self.bins = bins - - def fit(self, X, y=None): - # Fit doesn't need to do anything as we are directly using provided bins - return self - - def transform(self, X): - if isinstance(self.bins, int): - # Calculate equal width bins based on the range of the data and number of bins - _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True) - else: - # Use predefined bins - bins = self.bins - - # Apply the bins to the data - binned_data = pd.cut( - X.squeeze(), - bins=np.sort(np.unique(bins)), - labels=False, - include_lowest=True, - ) - print(binned_data) - return np.expand_dims(np.array(binned_data), 1) - - -class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin): - """ - This encoder converts categorical features into continuous integer values. Each unique category within a feature - is assigned a unique integer based on its order of appearance in the dataset. This transformation is useful for - models that can only handle continuous data. - - Attributes: - mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers - for a single feature. - - Methods: - fit(X, y=None): Learns the mapping from original categories to integers. - transform(X): Applies the learned mapping to the data. - get_feature_names_out(input_features=None): Returns the input features after transformation. - """ - - def fit(self, X, y=None): - """ - Learns the mapping from original categories to integers for each feature. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to fit. - y (ignored): Not used, present for API consistency by convention. - - Returns: - self: Returns the instance itself. - """ - # Fit should determine the mapping from original categories to sequential integers starting from 0 - self.mapping_ = [ - {category: i for i, category in enumerate(np.unique(col))} for col in X.T - ] - return self - - def transform(self, X): - """ - Transforms the categories in X to their corresponding integer values based on the learned mapping. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to transform. - - Returns: - X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values. 
- """ - # Transform the categories to their mapped integer values - X_transformed = np.array( - [ - [self.mapping_[col].get(value, -1) for col, value in enumerate(row)] - for row in X - ] - ) - return X_transformed - - def get_feature_names_out(self, input_features=None): - """ - Returns the names of the transformed features. - - Parameters: - input_features (list of str): The names of the input features. - - Returns: - input_features (array of shape (n_features,)): The names of the output features after transformation. - """ - if input_features is None: - raise ValueError("input_features must be specified") - return input_features - - -class OneHotFromOrdinal(TransformerMixin, BaseEstimator): - """ - A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful - in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required - for model training. - - Attributes: - max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature, - determining the size of the one-hot encoded array for that feature. - - Methods: - fit(X, y=None): Learns the maximum bin index for each feature. - transform(X): Converts ordinal-encoded features into one-hot format. - get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding. - """ - - def fit(self, X, y=None): - """ - Learns the maximum bin index for each feature from the data. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features. - y (ignored): Not used, present for API consistency by convention. - - Returns: - self: Returns the instance itself. - """ - self.max_bins_ = ( - np.max(X, axis=0).astype(int) + 1 - ) # Find the maximum bin index for each feature - return self - - def transform(self, X): - """ - Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features. - - Returns: - X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features. - """ - # Initialize an empty list to hold the one-hot encoded arrays - one_hot_encoded = [] - for i, max_bins in enumerate(self.max_bins_): - # Convert each feature to one-hot using its max_bins - feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)] - one_hot_encoded.append(feature_one_hot) - # Concatenate the one-hot encoded features horizontally - return np.hstack(one_hot_encoded) - - def get_feature_names_out(self, input_features=None): - """ - Generates feature names for the one-hot encoded features based on the input feature names and the number of bins. - - Parameters: - input_features (list of str): The names of the input features that were ordinal-encoded. - - Returns: - feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features. - """ - feature_names = [] - for i, max_bins in enumerate(self.max_bins_): - feature_names.extend( - [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] - ) - return np.array(feature_names) +from .ple_encoding import PLE class Preprocessor: @@ -195,10 +31,15 @@ class Preprocessor: use_decision_tree_bins (bool): If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. 
This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy (str): Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task (str): Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning. Attributes: - column_transformer (ColumnTransformer): A sklearn ColumnTransformer instance that holds the configured - preprocessing pipelines for the different feature types. + column_transformer (ColumnTransformer): An instance of sklearn's ColumnTransformer that holds the + configured preprocessing pipelines for different feature types. + fitted (bool): Indicates whether the preprocessor has been fitted to the data. Methods: fit(X, y=None): Fits the preprocessor to the data, identifying feature types and configuring the @@ -215,13 +56,25 @@ def __init__( numerical_preprocessing="binning", use_decision_tree_bins=False, binning_strategy="uniform", + task="regression", ): self.n_bins = n_bins - self.numerical_preprocessing = numerical_preprocessing + self.numerical_preprocessing = numerical_preprocessing.lower() + if self.numerical_preprocessing not in [ + "ple", + "binning", + "one_hot", + "standardization", + "normalization", + ]: + raise ValueError( + "Invalid numerical_preprocessing value. Supported values are 'ple', 'binning', 'one_hot', 'standardization', and 'normalization'." + ) self.use_decision_tree_bins = use_decision_tree_bins self.column_transformer = None self.fitted = False self.binning_strategy = binning_strategy + self.task = task def set_params(self, **params): for key, value in params.items(): @@ -325,6 +178,12 @@ def fit(self, X, y=None): elif self.numerical_preprocessing == "normalization": numeric_transformer_steps.append(("normalizer", MinMaxScaler())) + elif self.numerical_preprocessing == "ple": + numeric_transformer_steps.append(("normalizer", MinMaxScaler())) + numeric_transformer_steps.append( + ("ple", PLE(n_bins=self.n_bins, task=self.task)) + ) + numeric_transformer = Pipeline(numeric_transformer_steps) transformers.append((f"num_{feature}", numeric_transformer, [feature])) @@ -380,53 +239,53 @@ def _get_decision_tree_bins(self, X, y, numerical_features): def transform(self, X): """ - Transforms the dataset using the fitted preprocessing pipelines. This method applies the transformations set up during the fitting process - to the input data and returns a dictionary with the transformed data. + Transforms the input data using the preconfigured column transformer and converts the output into a dictionary + format with keys corresponding to transformed feature names and values as arrays of transformed data. + + This method converts the sparse or dense matrix returned by the column transformer into a more accessible + dictionary format, where each key-value pair represents a feature and its transformed data. Parameters: - X (DataFrame or dict): The input dataset to be transformed. + X (DataFrame): The input data to be transformed. Returns: - dict: A dictionary where keys are the base feature names and values are the transformed features as arrays. + dict: A dictionary where keys are the names of the features (as per the transformations defined in the + column transformer) and the values are numpy arrays of the transformed data. 
""" - if not self.fitted: - raise NotFittedError( - "This Preprocessor instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - if isinstance(X, dict): - X = pd.DataFrame(X) - - # Transform X using the column transformer - transformed_X = self.column_transformer.transform( - X - ) # To understand the shape of the transformed data + transformed_X = self.column_transformer.transform(X) - # Initialize the transformed dictionary - transformed_dict = {} + # Now let's convert this into a dictionary of arrays, one per column + transformed_dict = self._split_transformed_output(X, transformed_X) + return transformed_dict - # Retrieve output feature names from the column transformer - output_features = self.column_transformer.get_feature_names_out() + def _split_transformed_output(self, X, transformed_X): + """ + Splits the transformed data array into a dictionary where keys correspond to the original column names or + feature groups and values are the transformed data for those columns. - # Iterate over each output feature name to populate the transformed_dict - for i, col in enumerate(output_features): - # Extract the base feature name (before any transformation) - base_feature = col.split("__")[0] + This helper method is utilized within `transform` to segregate the transformed data based on the + specification in the column transformer, assigning each transformed section to its corresponding feature name. - # If the base feature name already exists in the dictionary, append the new data - if base_feature in transformed_dict: - transformed_dict[base_feature] = np.vstack( - [transformed_dict[base_feature], transformed_X[:, i]] - ) - else: - # Otherwise, create a new entry in the dictionary - transformed_dict[base_feature] = transformed_X[:, i] + Parameters: + X (DataFrame): The original input data, used for determining shapes and transformations. + transformed_X (numpy array): The transformed data as a numpy array, outputted by the column transformer. - # Ensure all arrays in the dictionary are the correct shape - for key in transformed_dict.keys(): - transformed_dict[key] = ( - transformed_dict[key].reshape(-1, transformed_X.shape[0]).T - ) + Returns: + dict: A dictionary mapping each transformation's name to its respective numpy array of transformed data. + The type of each array (int or float) is determined based on the type of transformation applied. + """ + start = 0 + transformed_dict = {} + for ( + name, + transformer, + columns, + ) in self.column_transformer.transformers_: # skip 'remainder' + if transformer != "drop": + end = start + transformer.transform(X[[columns[0]]]).shape[1] + dtype = int if "cat" in name else float + transformed_dict[name] = transformed_X[:, start:end].astype(dtype) + start = end return transformed_dict @@ -447,12 +306,23 @@ def fit_transform(self, X, y=None): def get_feature_info(self): """ - Returns detailed information about the processed features, including the number of bins for binned features - and the dimensionality of encoded features. This method is useful for understanding the transformations applied to each feature. + Retrieves information about how features are encoded within the model's preprocessor. + This method identifies the type of encoding applied to each feature, categorizing them into binned or ordinal + encodings and other types of encodings (e.g., one-hot encoding after discretization). 
+ + This method should only be called after the preprocessor has been fitted, as it relies on the structure and + configuration of the `column_transformer` attribute. + + Raises: + RuntimeError: If the `column_transformer` is not yet fitted, indicating that the preprocessor must be + fitted before invoking this method. Returns: - tuple: A tuple containing two dictionaries, the first with information about binned or ordinal encoded features and - the second with information about other encoded features. + tuple of (dict, dict): + - The first dictionary maps feature names to their respective number of bins or categories if they are + processed using discretization or ordinal encoding. + - The second dictionary includes feature names with other encoding details, such as the dimension of + features after encoding transformations (e.g., one-hot encoding dimensions). """ binned_or_ordinal_info = {} other_encoding_info = {} @@ -507,7 +377,7 @@ def get_feature_info(self): ) other_encoding_info[feature_name] = transformed_feature.shape[1] print( - f"Feature: {feature_name} ({self.numerical_preprocessing}), Encoded feature dimension: {transformed_feature.shape[1]}" + f"Feature: {feature_name} (Other Encoding), Encoded feature dimension: {transformed_feature.shape[1]}" ) print("-" * 50) diff --git a/setup.py b/setup.py index c6ae326..a7ee80f 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,6 @@ def read_requirements(): "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - python_requires=">=3.6, <3.11", + python_requires=">=3.6, <=3.12.3", install_requires=read_requirements(), ) diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index bdcc26b..fb43c64 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -41,7 +41,14 @@ def test_fit_transform(self): """Test fitting and transforming the data.""" pp = Preprocessor(numerical_preprocessing="standardization") transformed_data = pp.fit_transform(self.data) - print(transformed_data) + self.assertIsInstance(transformed_data, dict) + self.assertTrue("num_numerical" in transformed_data) + self.assertTrue("cat_categorical" in transformed_data) + + def test_ple(self): + """Test fitting and transforming the data.""" + pp = Preprocessor(numerical_preprocessing="ple", n_bins=20) + transformed_data = pp.fit_transform(self.data) self.assertIsInstance(transformed_data, dict) self.assertTrue("num_numerical" in transformed_data) self.assertTrue("cat_categorical" in transformed_data) From 3c17fe1f0becae3fbccc56de1dd55ddfe755669b Mon Sep 17 00:00:00 2001 From: thielmaf Date: Tue, 28 May 2024 07:29:03 +0000 Subject: [PATCH 02/21] test new preprocessing (ple) --- mambular/utils/mlp_utils.py | 245 ++++++++++++++++++++++++ mambular/utils/ple_encoding.py | 156 +++++++++++++++ mambular/utils/prepro_utils.py | 167 ++++++++++++++++ mambular/utils/preprocessor.py | 335 ++++++++++----------------------- setup.py | 55 +++--- tests/test_preprocessor.py | 9 +- 6 files changed, 703 insertions(+), 264 deletions(-) create mode 100644 mambular/utils/mlp_utils.py create mode 100644 mambular/utils/ple_encoding.py create mode 100644 mambular/utils/prepro_utils.py diff --git a/mambular/utils/mlp_utils.py b/mambular/utils/mlp_utils.py new file mode 100644 index 0000000..78dc209 --- /dev/null +++ b/mambular/utils/mlp_utils.py @@ -0,0 +1,245 @@ +import torch +import torch.nn as nn + + +class Linear_skip_block(nn.Module): + """ + A neural network block that includes a linear layer, an activation function, a dropout layer, and 
optionally a + skip connection and batch normalization. The skip connection is added if the input and output feature sizes are equal. + + Parameters + ---------- + n_input : int + The number of input features. + n_output : int + The number of output features. + dropout_rate : float + The rate of dropout to apply for regularization. + activation_fn : torch.nn.modules.activation, optional + The activation function to use after the linear layer. Default is nn.LeakyReLU(). + use_batch_norm : bool, optional + Whether to apply batch normalization after the activation function. Default is False. + + Attributes + ---------- + fc : torch.nn.Linear + The linear transformation layer. + act : torch.nn.Module + The activation function. + drop : torch.nn.Dropout + The dropout layer. + use_batch_norm : bool + Indicator of whether batch normalization is used. + batch_norm : torch.nn.BatchNorm1d, optional + The batch normalization layer, instantiated if use_batch_norm is True. + use_skip : bool + Indicator of whether a skip connection is used. + """ + + def __init__( + self, + n_input, + n_output, + dropout_rate, + activation_fn=nn.LeakyReLU(), + use_batch_norm=False, + ): + super(Linear_skip_block, self).__init__() + + self.fc = nn.Linear(n_input, n_output) + self.act = activation_fn + self.drop = nn.Dropout(dropout_rate) + self.use_batch_norm = use_batch_norm + self.use_skip = ( + n_input == n_output + ) # Only use skip connection if input and output sizes are equal + + if use_batch_norm: + self.batch_norm = nn.BatchNorm1d(n_output) # Initialize batch normalization + + def forward(self, x): + """ + Defines the forward pass of the Linear_block. + + Parameters + ---------- + x : Tensor + The input tensor to the block. + + Returns + ------- + Tensor + The output tensor after processing through the linear layer, activation function, dropout, + and optional batch normalization. + """ + x0 = x # Save input for possible skip connection + x = self.fc(x) + x = self.act(x) + + if self.use_batch_norm: + x = self.batch_norm(x) # Apply batch normalization after activation + + if self.use_skip: + x = x + x0 # Add skip connection if applicable + + x = self.drop(x) # Apply dropout + return x + + +class Linear_block(nn.Module): + """ + A neural network block that includes a linear layer, an activation function, a dropout layer, and optionally batch normalization. + + Parameters + ---------- + n_input : int + The number of input features. + n_output : int + The number of output features. + dropout_rate : float + The rate of dropout to apply. + activation_fn : torch.nn.modules.activation, optional + The activation function to use after the linear layer. Default is nn.LeakyReLU(). + batch_norm : bool, optional + Whether to include batch normalization after the activation function. Default is False. + + Attributes + ---------- + block : torch.nn.Sequential + A sequential container holding the linear layer, activation function, dropout, and optionally batch normalization. + """ + + def __init__( + self, + n_input, + n_output, + dropout_rate, + activation_fn=nn.LeakyReLU(), + batch_norm=False, + ): + super(Linear_block, self).__init__() + + # Initialize modules + modules = [ + nn.Linear(n_input, n_output), + activation_fn, + nn.Dropout(dropout_rate), + ] + + # Optionally add batch normalization + if batch_norm: + modules.append(nn.BatchNorm1d(n_output)) + + # Create the sequential model + self.block = nn.Sequential(*modules) + + def forward(self, x): + """ + Defines the forward pass of the Linear_block. 
+ + Parameters + ---------- + x : Tensor + The input tensor to the block. + + Returns + ------- + Tensor + The output tensor after processing through the linear layer, activation function, dropout, + and optional batch normalization. + """ + # Pass the input through the block + return self.block(x) + + +class MLP(nn.Module): + """ + A multi-layer perceptron (MLP) for regression tasks, configurable with optional skip connections and batch normalization. + + Parameters + ---------- + n_input_units : int + The number of units in the input layer. + hidden_units_list : list of int + A list specifying the number of units in each hidden layer. + n_output_units : int + The number of units in the output layer. + dropout_rate : float + The dropout rate used across the MLP. + use_skip_layers : bool, optional + Whether to use skip connections in layers where input and output sizes match. Default is False. + activation_fn : torch.nn.modules.activation, optional + The activation function used across the layers. Default is nn.LeakyReLU(). + use_batch_norm : bool, optional + Whether to apply batch normalization in each layer. Default is False. + + Attributes + ---------- + hidden_layers : torch.nn.Sequential + Sequential container of layers comprising the MLP's hidden layers. + linear_final : torch.nn.Linear + The final linear layer of the MLP. + """ + + def __init__( + self, + n_input_units, + hidden_units_list=[64, 32, 32], + n_output_units: int = 1, + dropout_rate: float = 0.1, + use_skip_layers: bool = False, + activation_fn=nn.LeakyReLU(), + use_batch_norm: bool = False, + ): + super(MLP, self).__init__() + self.n_input_units = n_input_units + self.hidden_units_list = hidden_units_list + self.dropout_rate = dropout_rate + self.n_output_units = n_output_units + + layers = [] + input_units = n_input_units + + for n_hidden_units in hidden_units_list: + if use_skip_layers and input_units == n_hidden_units: + layers.append( + Linear_skip_block( + input_units, + n_hidden_units, + dropout_rate, + activation_fn, + use_batch_norm, + ) + ) + else: + layers.append( + Linear_block( + input_units, + n_hidden_units, + dropout_rate, + activation_fn, + use_batch_norm, + ) + ) + input_units = n_hidden_units # Update input_units for the next layer + + self.hidden_layers = nn.Sequential(*layers) + self.linear_final = nn.Linear(input_units, n_output_units) # Final layer + + def forward(self, x): + """ + Defines the forward pass of the MLP. + + Parameters + ---------- + x : Tensor + The input tensor to the MLP. + + Returns + ------- + Tensor + The output predictions of the model for regression tasks. + """ + x = self.hidden_layers(x) + x = self.linear_final(x) + return x diff --git a/mambular/utils/ple_encoding.py b/mambular/utils/ple_encoding.py new file mode 100644 index 0000000..972b36b --- /dev/null +++ b/mambular/utils/ple_encoding.py @@ -0,0 +1,156 @@ +import numpy as np +from tqdm import tqdm +import pandas as pd +import bisect +import re +from sklearn.tree import _tree +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +import pandas as pd +import numpy as np +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier + + +def tree_to_code(tree, feature_names): + """ + Convert a scikit-learn decision tree into a list of conditions. + + Args: + tree (sklearn.tree.DecisionTreeRegressor or sklearn.tree.DecisionTreeClassifier): + The decision tree model to be converted. 
+ feature_names (list of str): The names of the features used in the tree. + Y (array-like): The target values associated with the tree. + + Returns: + list of str: A list of conditions representing the decision tree paths. + + Example: + # Convert a decision tree into a list of conditions + tree_conditions = tree_to_code(tree_model, feature_names, target_values) + """ + + tree_ = tree.tree_ + feature_name = [ + feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" + for i in tree_.feature + ] + + pathto = dict() + my_list = [] + + global k + k = 0 + + def recurse(node, depth, parent): + global k + indent = " " * depth + + if tree_.feature[node] != _tree.TREE_UNDEFINED: + # name = df_name + "[" + "'" + feature_name[node]+ "'" + "]" + name = feature_name[node] + threshold = tree_.threshold[node] + s = "{} <= {} ".format(name, threshold, node) + if node == 0: + pathto[node] = "(" + s + ")" + else: + pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")" + + recurse(tree_.children_left[node], depth + 1, node) + s = "{} > {}".format(name, threshold) + if node == 0: + pathto[node] = s + else: + pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")" + recurse(tree_.children_right[node], depth + 1, node) + else: + k = k + 1 + my_list.append(pathto[parent]) + # print(k,')',pathto[parent], tree_.value[node]) + + recurse(0, 1, 0) + + return my_list + + +class PLE(BaseEstimator, TransformerMixin): + def __init__( + self, n_bins=20, tree_params={}, task="regression", conditions=None, **kwargs + ): + super(PLE, self).__init__(**kwargs) + + self.task = task + self.tree_params = tree_params + self.n_bins = n_bins + self.conditions = conditions + self.pattern = ( + r"-?\d+\.?\d*[eE]?[+-]?\d*" # This pattern matches integers and floats + ) + + def fit(self, feature, target): + if self.task == "regression": + dt = DecisionTreeRegressor(max_leaf_nodes=self.n_bins) + elif self.task == "classification": + dt = DecisionTreeClassifier(max_leaf_nodes=self.n_bins) + else: + raise ValueError("This task is not supported") + + dt.fit(feature, target) + + self.conditions = tree_to_code(dt, ["feature"]) + return self + + def transform(self, feature): + if feature.shape == (feature.shape[0], 1): + feature = np.squeeze(feature, axis=1) + else: + feature = feature + result_list = [] + for idx, cond in enumerate(self.conditions): + result_list.append(eval(cond) * (idx + 1)) + + encoded_feature = np.expand_dims(np.sum(np.stack(result_list).T, axis=1), 1) + + encoded_feature = np.array(encoded_feature - 1, dtype=np.int64) + + # Initialize an empty list to store the extracted numbers + locations = [] + # Iterate through the strings and extract numbers + for string in self.conditions: + matches = re.findall(self.pattern, string) + locations.extend(matches) + + locations = [float(number) for number in locations] + locations = list(set(locations)) + locations = np.sort(locations) + + ple_encoded_feature = np.zeros((len(feature), locations.shape[0] + 1)) + if locations[-1] > np.max(feature): + locations[-1] = np.max(feature) + + for idx in range(len(encoded_feature)): + if feature[idx] >= locations[-1]: + ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx] + ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1 + elif feature[idx] <= locations[0]: + ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx] + + else: + ple_encoded_feature[idx][encoded_feature[idx]] = ( + feature[idx] - locations[(encoded_feature[idx] - 1)[0]] + ) / ( + locations[(encoded_feature[idx])[0]] + - 
locations[(encoded_feature[idx] - 1)[0]] + ) + + ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1 + + if ple_encoded_feature.shape[1] == 1: + return np.zeros([len(feature), self.n_bins]) + + else: + return np.array(ple_encoded_feature, dtype=np.float32) + + def get_feature_names_out(self, input_features=None): + if input_features is None: + raise ValueError("input_features must be specified") + return input_features diff --git a/mambular/utils/prepro_utils.py b/mambular/utils/prepro_utils.py new file mode 100644 index 0000000..4bb9fa7 --- /dev/null +++ b/mambular/utils/prepro_utils.py @@ -0,0 +1,167 @@ +import pandas as pd +import numpy as np +from sklearn.base import TransformerMixin, BaseEstimator + + +class CustomBinner(TransformerMixin): + def __init__(self, bins): + # bins can be a scalar (number of bins) or array-like (bin edges) + self.bins = bins + + def fit(self, X, y=None): + # Fit doesn't need to do anything as we are directly using provided bins + return self + + def transform(self, X): + if isinstance(self.bins, int): + # Calculate equal width bins based on the range of the data and number of bins + _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True) + else: + # Use predefined bins + bins = self.bins + + # Apply the bins to the data + binned_data = pd.cut( + X.squeeze(), + bins=np.sort(np.unique(bins)), + labels=False, + include_lowest=True, + ) + print(binned_data) + return np.expand_dims(np.array(binned_data), 1) + + +class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin): + """ + This encoder converts categorical features into continuous integer values. Each unique category within a feature + is assigned a unique integer based on its order of appearance in the dataset. This transformation is useful for + models that can only handle continuous data. + + Attributes: + mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers + for a single feature. + + Methods: + fit(X, y=None): Learns the mapping from original categories to integers. + transform(X): Applies the learned mapping to the data. + get_feature_names_out(input_features=None): Returns the input features after transformation. + """ + + def fit(self, X, y=None): + """ + Learns the mapping from original categories to integers for each feature. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit. + y (ignored): Not used, present for API consistency by convention. + + Returns: + self: Returns the instance itself. + """ + # Fit should determine the mapping from original categories to sequential integers starting from 0 + self.mapping_ = [ + {category: i for i, category in enumerate(np.unique(col))} for col in X.T + ] + return self + + def transform(self, X): + """ + Transforms the categories in X to their corresponding integer values based on the learned mapping. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform. + + Returns: + X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values. + """ + # Transform the categories to their mapped integer values + X_transformed = np.array( + [ + [self.mapping_[col].get(value, -1) for col, value in enumerate(row)] + for row in X + ] + ) + return X_transformed + + def get_feature_names_out(self, input_features=None): + """ + Returns the names of the transformed features. + + Parameters: + input_features (list of str): The names of the input features. 
+ + Returns: + input_features (array of shape (n_features,)): The names of the output features after transformation. + """ + if input_features is None: + raise ValueError("input_features must be specified") + return input_features + + +class OneHotFromOrdinal(TransformerMixin, BaseEstimator): + """ + A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful + in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required + for model training. + + Attributes: + max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature, + determining the size of the one-hot encoded array for that feature. + + Methods: + fit(X, y=None): Learns the maximum bin index for each feature. + transform(X): Converts ordinal-encoded features into one-hot format. + get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding. + """ + + def fit(self, X, y=None): + """ + Learns the maximum bin index for each feature from the data. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features. + y (ignored): Not used, present for API consistency by convention. + + Returns: + self: Returns the instance itself. + """ + self.max_bins_ = ( + np.max(X, axis=0).astype(int) + 1 + ) # Find the maximum bin index for each feature + return self + + def transform(self, X): + """ + Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features. + + Returns: + X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features. + """ + # Initialize an empty list to hold the one-hot encoded arrays + one_hot_encoded = [] + for i, max_bins in enumerate(self.max_bins_): + # Convert each feature to one-hot using its max_bins + feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)] + one_hot_encoded.append(feature_one_hot) + # Concatenate the one-hot encoded features horizontally + return np.hstack(one_hot_encoded) + + def get_feature_names_out(self, input_features=None): + """ + Generates feature names for the one-hot encoded features based on the input feature names and the number of bins. + + Parameters: + input_features (list of str): The names of the input features that were ordinal-encoded. + + Returns: + feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features. 
+ """ + feature_names = [] + for i, max_bins in enumerate(self.max_bins_): + feature_names.extend( + [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] + ) + return np.array(feature_names) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index 2056c69..b754951 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -1,180 +1,17 @@ import numpy as np import pandas as pd -from sklearn.base import BaseEstimator, TransformerMixin +import numpy as np +from .prepro_utils import OneHotFromOrdinal, CustomBinner, ContinuousOrdinalEncoder +from sklearn.preprocessing import ( + StandardScaler, + KBinsDiscretizer, + MinMaxScaler, +) +from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier from sklearn.compose import ColumnTransformer -from sklearn.exceptions import NotFittedError -from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline -from sklearn.preprocessing import (KBinsDiscretizer, MinMaxScaler, - StandardScaler) -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor - -__all__ = ['Preprocessor'] - - -class CustomBinner(TransformerMixin): - def __init__(self, bins): - # bins can be a scalar (number of bins) or array-like (bin edges) - self.bins = bins - - def fit(self, X, y=None): - # Fit doesn't need to do anything as we are directly using provided bins - return self - - def transform(self, X): - if isinstance(self.bins, int): - # Calculate equal width bins based on the range of the data and number of bins - _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True) - else: - # Use predefined bins - bins = self.bins - - # Apply the bins to the data - binned_data = pd.cut( - X.squeeze(), - bins=np.sort(np.unique(bins)), - labels=False, - include_lowest=True, - ) - print(binned_data) - return np.expand_dims(np.array(binned_data), 1) - - -class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin): - """ - This encoder converts categorical features into continuous integer values. Each unique category within a feature - is assigned a unique integer based on its order of appearance in the dataset. This transformation is useful for - models that can only handle continuous data. - - Attributes: - mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers - for a single feature. - - Methods: - fit(X, y=None): Learns the mapping from original categories to integers. - transform(X): Applies the learned mapping to the data. - get_feature_names_out(input_features=None): Returns the input features after transformation. - """ - - def fit(self, X, y=None): - """ - Learns the mapping from original categories to integers for each feature. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to fit. - y (ignored): Not used, present for API consistency by convention. - - Returns: - self: Returns the instance itself. - """ - # Fit should determine the mapping from original categories to sequential integers starting from 0 - self.mapping_ = [ - {category: i for i, category in enumerate(np.unique(col))} for col in X.T - ] - return self - - def transform(self, X): - """ - Transforms the categories in X to their corresponding integer values based on the learned mapping. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to transform. - - Returns: - X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values. 
- """ - # Transform the categories to their mapped integer values - X_transformed = np.array( - [ - [self.mapping_[col].get(value, -1) - for col, value in enumerate(row)] - for row in X - ] - ) - return X_transformed - - def get_feature_names_out(self, input_features=None): - """ - Returns the names of the transformed features. - - Parameters: - input_features (list of str): The names of the input features. - - Returns: - input_features (array of shape (n_features,)): The names of the output features after transformation. - """ - if input_features is None: - raise ValueError("input_features must be specified") - return input_features - - -class OneHotFromOrdinal(TransformerMixin, BaseEstimator): - """ - A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful - in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required - for model training. - - Attributes: - max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature, - determining the size of the one-hot encoded array for that feature. - - Methods: - fit(X, y=None): Learns the maximum bin index for each feature. - transform(X): Converts ordinal-encoded features into one-hot format. - get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding. - """ - - def fit(self, X, y=None): - """ - Learns the maximum bin index for each feature from the data. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features. - y (ignored): Not used, present for API consistency by convention. - - Returns: - self: Returns the instance itself. - """ - self.max_bins_ = ( - np.max(X, axis=0).astype(int) + 1 - ) # Find the maximum bin index for each feature - return self - - def transform(self, X): - """ - Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features. - - Returns: - X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features. - """ - # Initialize an empty list to hold the one-hot encoded arrays - one_hot_encoded = [] - for i, max_bins in enumerate(self.max_bins_): - # Convert each feature to one-hot using its max_bins - feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)] - one_hot_encoded.append(feature_one_hot) - # Concatenate the one-hot encoded features horizontally - return np.hstack(one_hot_encoded) - - def get_feature_names_out(self, input_features=None): - """ - Generates feature names for the one-hot encoded features based on the input feature names and the number of bins. - - Parameters: - input_features (list of str): The names of the input features that were ordinal-encoded. - - Returns: - feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features. - """ - feature_names = [] - for i, max_bins in enumerate(self.max_bins_): - feature_names.extend( - [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] - ) - return np.array(feature_names) +from sklearn.impute import SimpleImputer +from .ple_encoding import PLE class Preprocessor: @@ -196,11 +33,15 @@ class Preprocessor: use_decision_tree_bins (bool): If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. 
This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy (str): Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task (str): Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning. - Attributes - ---------- - column_transformer (ColumnTransformer): A sklearn ColumnTransformer instance that holds the configured - preprocessing pipelines for the different feature types. + Attributes: + column_transformer (ColumnTransformer): An instance of sklearn's ColumnTransformer that holds the + configured preprocessing pipelines for different feature types. + fitted (bool): Indicates whether the preprocessor has been fitted to the data. """ @@ -210,13 +51,25 @@ def __init__( numerical_preprocessing="binning", use_decision_tree_bins=False, binning_strategy="uniform", + task="regression", ): self.n_bins = n_bins - self.numerical_preprocessing = numerical_preprocessing + self.numerical_preprocessing = numerical_preprocessing.lower() + if self.numerical_preprocessing not in [ + "ple", + "binning", + "one_hot", + "standardization", + "normalization", + ]: + raise ValueError( + "Invalid numerical_preprocessing value. Supported values are 'ple', 'binning', 'one_hot', 'standardization', and 'normalization'." + ) self.use_decision_tree_bins = use_decision_tree_bins self.column_transformer = None self.fitted = False self.binning_strategy = binning_strategy + self.task = task def set_params(self, **params): for key, value in params.items(): @@ -244,8 +97,7 @@ def _detect_column_types(self, X): num_unique_values = X[col].nunique() total_samples = len(X[col]) if X[col].dtype.kind not in "iufc" or ( - X[col].dtype.kind == "i" and ( - num_unique_values / total_samples) < 0.05 + X[col].dtype.kind == "i" and (num_unique_values / total_samples) < 0.05 ): categorical_features.append(col) else: @@ -278,8 +130,7 @@ def fit(self, X, y=None): if self.numerical_preprocessing in ["binning", "one_hot"]: bins = ( - self._get_decision_tree_bins( - X[[feature]], y, [feature]) + self._get_decision_tree_bins(X[[feature]], y, [feature]) if self.use_decision_tree_bins else self.n_bins ) @@ -294,8 +145,7 @@ def fit(self, X, y=None): else len(bins) - 1, encode="ordinal", strategy=self.binning_strategy, - subsample=200_000 if len( - X) > 200_000 else None, + subsample=200_000 if len(X) > 200_000 else None, ), ), ] @@ -318,17 +168,20 @@ def fit(self, X, y=None): ) elif self.numerical_preprocessing == "standardization": - numeric_transformer_steps.append( - ("scaler", StandardScaler())) + numeric_transformer_steps.append(("scaler", StandardScaler())) elif self.numerical_preprocessing == "normalization": + numeric_transformer_steps.append(("normalizer", MinMaxScaler())) + + elif self.numerical_preprocessing == "ple": + numeric_transformer_steps.append(("normalizer", MinMaxScaler())) numeric_transformer_steps.append( - ("normalizer", MinMaxScaler())) + ("ple", PLE(n_bins=self.n_bins, task=self.task)) + ) numeric_transformer = Pipeline(numeric_transformer_steps) - transformers.append( - (f"num_{feature}", numeric_transformer, [feature])) + transformers.append((f"num_{feature}", numeric_transformer, [feature])) if categorical_features: for feature in categorical_features: @@ -375,60 +228,59 @@ def _get_decision_tree_bins(self, X, y, numerical_features): bin_edges = 
np.sort(np.unique(thresholds)) bins.append( - np.concatenate( - ([X[feature].min()], bin_edges, [X[feature].max()])) + np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()])) ) return bins def transform(self, X): """ - Transforms the dataset using the fitted preprocessing pipelines. This method applies the transformations set up during the fitting process - to the input data and returns a dictionary with the transformed data. + Transforms the input data using the preconfigured column transformer and converts the output into a dictionary + format with keys corresponding to transformed feature names and values as arrays of transformed data. + + This method converts the sparse or dense matrix returned by the column transformer into a more accessible + dictionary format, where each key-value pair represents a feature and its transformed data. Parameters: - X (DataFrame or dict): The input dataset to be transformed. + X (DataFrame): The input data to be transformed. Returns: - dict: A dictionary where keys are the base feature names and values are the transformed features as arrays. + dict: A dictionary where keys are the names of the features (as per the transformations defined in the + column transformer) and the values are numpy arrays of the transformed data. """ - if not self.fitted: - raise NotFittedError( - "This Preprocessor instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." - ) + transformed_X = self.column_transformer.transform(X) - if isinstance(X, dict): - X = pd.DataFrame(X) - - # Transform X using the column transformer - transformed_X = self.column_transformer.transform( - X - ) # To understand the shape of the transformed data - - # Initialize the transformed dictionary - transformed_dict = {} + # Now let's convert this into a dictionary of arrays, one per column + transformed_dict = self._split_transformed_output(X, transformed_X) + return transformed_dict - # Retrieve output feature names from the column transformer - output_features = self.column_transformer.get_feature_names_out() + def _split_transformed_output(self, X, transformed_X): + """ + Splits the transformed data array into a dictionary where keys correspond to the original column names or + feature groups and values are the transformed data for those columns. - # Iterate over each output feature name to populate the transformed_dict - for i, col in enumerate(output_features): - # Extract the base feature name (before any transformation) - base_feature = col.split("__")[0] + This helper method is utilized within `transform` to segregate the transformed data based on the + specification in the column transformer, assigning each transformed section to its corresponding feature name. - # If the base feature name already exists in the dictionary, append the new data - if base_feature in transformed_dict: - transformed_dict[base_feature] = np.vstack( - [transformed_dict[base_feature], transformed_X[:, i]] - ) - else: - # Otherwise, create a new entry in the dictionary - transformed_dict[base_feature] = transformed_X[:, i] + Parameters: + X (DataFrame): The original input data, used for determining shapes and transformations. + transformed_X (numpy array): The transformed data as a numpy array, outputted by the column transformer. 
- # Ensure all arrays in the dictionary are the correct shape - for key in transformed_dict.keys(): - transformed_dict[key] = ( - transformed_dict[key].reshape(-1, transformed_X.shape[0]).T - ) + Returns: + dict: A dictionary mapping each transformation's name to its respective numpy array of transformed data. + The type of each array (int or float) is determined based on the type of transformation applied. + """ + start = 0 + transformed_dict = {} + for ( + name, + transformer, + columns, + ) in self.column_transformer.transformers_: # skip 'remainder' + if transformer != "drop": + end = start + transformer.transform(X[[columns[0]]]).shape[1] + dtype = int if "cat" in name else float + transformed_dict[name] = transformed_X[:, start:end].astype(dtype) + start = end return transformed_dict @@ -449,12 +301,23 @@ def fit_transform(self, X, y=None): def get_feature_info(self): """ - Returns detailed information about the processed features, including the number of bins for binned features - and the dimensionality of encoded features. This method is useful for understanding the transformations applied to each feature. + Retrieves information about how features are encoded within the model's preprocessor. + This method identifies the type of encoding applied to each feature, categorizing them into binned or ordinal + encodings and other types of encodings (e.g., one-hot encoding after discretization). + + This method should only be called after the preprocessor has been fitted, as it relies on the structure and + configuration of the `column_transformer` attribute. + + Raises: + RuntimeError: If the `column_transformer` is not yet fitted, indicating that the preprocessor must be + fitted before invoking this method. Returns: - tuple: A tuple containing two dictionaries, the first with information about binned or ordinal encoded features and - the second with information about other encoded features. + tuple of (dict, dict): + - The first dictionary maps feature names to their respective number of bins or categories if they are + processed using discretization or ordinal encoding. + - The second dictionary includes feature names with other encoding details, such as the dimension of + features after encoding transformations (e.g., one-hot encoding dimensions). 
""" binned_or_ordinal_info = {} other_encoding_info = {} @@ -473,8 +336,7 @@ def get_feature_info(self): # Handle features processed with discretization if "discretizer" in steps: step = transformer_pipeline.named_steps["discretizer"] - n_bins = step.n_bins_[0] if hasattr( - step, "n_bins_") else None + n_bins = step.n_bins_[0] if hasattr(step, "n_bins_") else None # Check if discretization is followed by one-hot encoding if "onehot_from_ordinal" in steps: @@ -495,8 +357,7 @@ def get_feature_info(self): # Handle features processed with continuous ordinal encoding elif "continuous_ordinal" in steps: step = transformer_pipeline.named_steps["continuous_ordinal"] - n_categories = len( - step.mapping_[columns.index(feature_name)]) + n_categories = len(step.mapping_[columns.index(feature_name)]) binned_or_ordinal_info[feature_name] = n_categories print( f"Categorical Feature (Ordinal Encoded): {feature_name}, Number of unique categories: {n_categories}" @@ -511,7 +372,7 @@ def get_feature_info(self): ) other_encoding_info[feature_name] = transformed_feature.shape[1] print( - f"Feature: {feature_name} ({self.numerical_preprocessing}), Encoded feature dimension: {transformed_feature.shape[1]}" + f"Feature: {feature_name} (Other Encoding), Encoded feature dimension: {transformed_feature.shape[1]}" ) print("-" * 50) diff --git a/setup.py b/setup.py index ee0c16d..5ce6018 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup # Package meta-data. -NAME = 'mambular' +NAME = "mambular" DESCRIPTION = "A python package for tabular deep learning with mamba blocks." HOMEPAGE = "https://github.com/basf/mamba-tabular" DOCS = "https://mambular.readthedocs.io/en/latest/index.html" @@ -16,7 +16,7 @@ # Load the package's verison file and its content. 
ROOT_DIR = Path(__file__).resolve().parent -PACKAGE_DIR = ROOT_DIR / 'mambular' +PACKAGE_DIR = ROOT_DIR / "mambular" with open(PACKAGE_DIR / "__version__.py") as f: VERSION = f.readlines()[-1].split()[-1].strip("\"'") @@ -24,33 +24,36 @@ # ger install_reqs from requirements file, used for setup function later with open(os.path.join(ROOT_DIR, "requirements.txt")) as f: # next(f) - install_reqs = [line.rstrip() for line in f.readlines() - if not line.startswith("#") and not line.startswith("git+")] + install_reqs = [ + line.rstrip() + for line in f.readlines() + if not line.startswith("#") and not line.startswith("git+") + ] # get long description from readme file with open(os.path.join(ROOT_DIR, "README.md")) as f: LONG_DESCRIPTION = f.read() -setup(name=NAME, - version=VERSION, - description=DESCRIPTION, - long_description=LONG_DESCRIPTION, - long_description_content_type="text/markdown", - author=AUTHOR, - author_email=EMAIL, - python_requires=REQUIRES_PYTHON, - install_requires=install_reqs, - # extras_require=extras_reqs, - license="Copyright (c) 2024 BASF SE", # adapt based on your needs - packages=find_packages(), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - ], - include_package_data=True, - project_urls={'Homepage:': HOMEPAGE, - 'Documentation': DOCS}, - url=HOMEPAGE - ) +setup( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, + long_description_content_type="text/markdown", + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + install_requires=install_reqs, + # extras_require=extras_reqs, + license="Copyright (c) 2024 BASF SE", # adapt based on your needs + packages=find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + include_package_data=True, + project_urls={"Homepage:": HOMEPAGE, "Documentation": DOCS}, + url=HOMEPAGE, +) diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index bdcc26b..fb43c64 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -41,7 +41,14 @@ def test_fit_transform(self): """Test fitting and transforming the data.""" pp = Preprocessor(numerical_preprocessing="standardization") transformed_data = pp.fit_transform(self.data) - print(transformed_data) + self.assertIsInstance(transformed_data, dict) + self.assertTrue("num_numerical" in transformed_data) + self.assertTrue("cat_categorical" in transformed_data) + + def test_ple(self): + """Test fitting and transforming the data.""" + pp = Preprocessor(numerical_preprocessing="ple", n_bins=20) + transformed_data = pp.fit_transform(self.data) self.assertIsInstance(transformed_data, dict) self.assertTrue("num_numerical" in transformed_data) self.assertTrue("cat_categorical" in transformed_data) From 4e530a1d0e935c385fb46b3f08486c3474bc5285 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 07:32:41 +0000 Subject: [PATCH 03/21] include ple encodings --- mambular/utils/preprocessor.py | 39 +--------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index fef0dc2..767e4f1 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -58,7 +58,6 @@ def __init__( use_decision_tree_bins=False, binning_strategy="uniform", task="regression", - task="regression", ): self.n_bins = 
n_bins self.numerical_preprocessing = numerical_preprocessing.lower() @@ -284,32 +283,6 @@ def transform(self, X): transformed_dict = self._split_transformed_output(X, transformed_X) return transformed_dict - def _split_transformed_output(self, X, transformed_X): - """ - Splits the transformed data array into a dictionary where keys correspond to the original column names or - feature groups and values are the transformed data for those columns. - - This helper method is utilized within `transform` to segregate the transformed data based on the - specification in the column transformer, assigning each transformed section to its corresponding feature name. - - Parameters: - X (DataFrame): The original input data, used for determining shapes and transformations. - transformed_X (numpy array): The transformed data as a numpy array, outputted by the column transformer. - - Returns: - dict: A dictionary mapping each transformation's name to its respective numpy array of transformed data. - The type of each array (int or float) is determined based on the type of transformation applied. - """ - start = 0 - dict: A dictionary where keys are the names of the features (as per the transformations defined in the - column transformer) and the values are numpy arrays of the transformed data. - """ - transformed_X = self.column_transformer.transform(X) - - # Now let's convert this into a dictionary of arrays, one per column - transformed_dict = self._split_transformed_output(X, transformed_X) - return transformed_dict - def _split_transformed_output(self, X, transformed_X): """ Splits the transformed data array into a dictionary where keys correspond to the original column names or @@ -332,17 +305,7 @@ def _split_transformed_output(self, X, transformed_X): name, transformer, columns, - ) in self.column_transformer.transformers_: # skip 'remainder' - if transformer != "drop": - end = start + transformer.transform(X[[columns[0]]]).shape[1] - dtype = int if "cat" in name else float - transformed_dict[name] = transformed_X[:, start:end].astype(dtype) - start = end - for ( - name, - transformer, - columns, - ) in self.column_transformer.transformers_: # skip 'remainder' + ) in self.column_transformer.transformers_: if transformer != "drop": end = start + transformer.transform(X[[columns[0]]]).shape[1] dtype = int if "cat" in name else float From 1098f9450a96a9b180689e9de0166363f255984f Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 11:37:58 +0000 Subject: [PATCH 04/21] restructure regression module --- mambular/base_models/embedding_regressor.py | 40 ++-- mambular/base_models/regressor.py | 194 ++++++++++--------- mambular/models/sklearn_regressor.py | 201 +++++++++++++------- mambular/utils/default_mamba_params.py | 34 ++++ mambular/utils/mamba_arch.py | 191 +++++++++++++++---- mambular/utils/normalization_layers.py | 1 - mambular/utils/preprocessor.py | 4 +- 7 files changed, 441 insertions(+), 224 deletions(-) create mode 100644 mambular/utils/default_mamba_params.py diff --git a/mambular/base_models/embedding_regressor.py b/mambular/base_models/embedding_regressor.py index 2b9904e..d7c3b93 100644 --- a/mambular/base_models/embedding_regressor.py +++ b/mambular/base_models/embedding_regressor.py @@ -4,6 +4,7 @@ from ..utils.config import MambularConfig from ..utils.mamba_arch import Mamba +from ..utils.mlp_utils import MLP class BaseEmbeddingMambularRegressor(pl.LightningModule): @@ -57,6 +58,12 @@ def __init__( lr_factor=0.75, seq_size: int = 20, raw_embeddings=False, + 
head_layer_sizes=[64, 32, 32], + head_dropout: float = 0.3, + head_skip_layers: bool = False, + head_activation="leakyrelu", + head_use_batch_norm: bool = False, + attn_dropout: float = 0.3, ): super().__init__() @@ -97,8 +104,7 @@ def __init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(self.seq_size, - self.config.d_model, bias=False), + nn.Linear(self.seq_size, self.config.d_model, bias=False), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) @@ -128,26 +134,17 @@ def __init__( self.mamba = Mamba(self.config) self.norm_f = self.config.norm(self.config.d_model) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() - ) - - # Dynamically create MLP layers based on config.tabular_units - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension - - # Iterate over the specified units for each layer in the MLP - for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units - - # Add the final linear layer to map to a single output value - mlp_layers.append(nn.Linear(input_dim, 1)) + head_activation = activations.get(head_activation.lower(), nn.Identity()) # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + self.tabular_head = MLP( + self.config.d_model, + hidden_units_list=head_layer_sizes, + dropout_rate=head_dropout, + use_skip_layers=head_skip_layers, + activation_fn=head_activation, + use_batch_norm=head_use_batch_norm, + ) self.pooling_method = self.config.pooling_method self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) @@ -176,8 +173,7 @@ def forward(self, cat_features, num_features): The output predictions of the model for regression tasks. """ batch_size = ( - cat_features[0].size(0) if cat_features != [ - ] else num_features[0].size(0) + cat_features[0].size(0) if cat_features != [] else num_features[0].size(0) ) cls_tokens = self.cls_token.expand(batch_size, -1, -1) # Process categorical features if present diff --git a/mambular/base_models/regressor.py b/mambular/base_models/regressor.py index 3b0f721..2ff671c 100644 --- a/mambular/base_models/regressor.py +++ b/mambular/base_models/regressor.py @@ -1,68 +1,38 @@ import lightning as pl import torch import torch.nn as nn - -from ..utils.config import MambularConfig from ..utils.mamba_arch import Mamba +from ..utils.mlp_utils import MLP +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) +from ..utils.default_mamba_params import DefaultConfig class BaseMambularRegressor(pl.LightningModule): - """ - A base regression module for tabular data built on PyTorch Lightning. It incorporates embeddings - for categorical and numerical features with a configurable architecture provided by MambularConfig. - This module is designed for regression tasks. - - Parameters - ---------- - config : MambularConfig - An instance of MambularConfig containing configuration parameters for the model architecture. - cat_feature_info : dict, optional - A dictionary mapping the names of categorical features to their number of unique categories. Defaults to None. - num_feature_info : dict, optional - A dictionary mapping the names of numerical features to their number of dimensions after embedding. Defaults to None. 
- lr : float, optional - The initial learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - The number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay (L2 penalty) coefficient. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced. Defaults to 0.75. - - - Attributes - ---------- - mamba : Mamba - The core neural network module implementing the Mamba architecture. - norm_f : nn.Module - Normalization layer applied after the Mamba block. - tabular_head : nn.Linear - Final linear layer mapping the features to a single output for regression tasks. - train_mse : torchmetrics.MeanSquaredError - Metric computation module for training Mean Squared Error. - val_mse : torchmetrics.MeanSquaredError - Metric computation module for validation Mean Squared Error. - loss_fct : torch.nn.MSELoss - The loss function for regression tasks. - """ - def __init__( self, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultConfig = DefaultConfig(), + **kwargs, ): super().__init__() - self.config = config - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -75,67 +45,108 @@ def __init__( "selu": nn.SELU(), "gelu": nn.GELU(), "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), "linear": nn.Identity(), + "silu": nn.functional.silu, } - self.embedding_activation = activations.get( - self.config.num_embedding_activation.lower() + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) + + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), ) + + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == 
"RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") + if self.embedding_activation is None: raise ValueError( - f"Unsupported activation function: {self.config.num_embedding_activation}" + f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" ) self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(input_shape, self.config.d_model, bias=False), - # Example using ReLU as the activation function, change as needed + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), self.embedding_activation, ) for feature_name, input_shape in num_feature_info.items() ] ) - # Create embedding layers for categorical features based on cat_feature_info self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() - ) - - # Dynamically create MLP layers based on config.tabular_units - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension - - # Iterate over the specified units for each layer in the MLP - for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units + head_activation = self.hparams.get("head_activation", config.head_activation) - # Add the final linear layer to map to a single output value - mlp_layers.append(nn.Linear(input_dim, 1)) - - # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + ) - self.pooling_method = self.config.pooling_method - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) self.loss_fct = nn.MSELoss() - if self.config.layer_norm_after_embedding: - self.embedding_norm = nn.LayerNorm(self.config.d_model) + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) def forward(self, cat_features, num_features): """ @@ -156,8 
+167,7 @@ def forward(self, cat_features, num_features): """ batch_size = ( - cat_features[0].size(0) if cat_features != [ - ] else num_features[0].size(0) + cat_features[0].size(0) if cat_features != [] else num_features[0].size(0) ) cls_tokens = self.cls_token.expand(batch_size, -1, -1) @@ -168,7 +178,7 @@ def forward(self, cat_features, num_features): ] cat_embeddings = torch.stack(cat_embeddings, dim=1) cat_embeddings = torch.squeeze(cat_embeddings, dim=2) - if self.config.layer_norm_after_embedding: + if self.hparams.get("layer_norm_after_embedding"): cat_embeddings = self.embedding_norm(cat_embeddings) else: cat_embeddings = None @@ -179,7 +189,7 @@ def forward(self, cat_features, num_features): emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) ] num_embeddings = torch.stack(num_embeddings, dim=1) - if self.config.layer_norm_after_embedding: + if self.hparams.get("layer_norm_after_embedding"): num_embeddings = self.embedding_norm(num_embeddings) else: num_embeddings = None @@ -209,7 +219,7 @@ def forward(self, cat_features, num_features): else: raise ValueError(f"Invalid pooling method: {self.pooling_method}") - x = self.norm_f(x) + x = self.norm_f.forward(x) preds = self.tabular_head(x) return preds @@ -281,7 +291,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( diff --git a/mambular/models/sklearn_regressor.py b/mambular/models/sklearn_regressor.py index ad23fd6..c75f5d8 100644 --- a/mambular/models/sklearn_regressor.py +++ b/mambular/models/sklearn_regressor.py @@ -6,30 +6,94 @@ from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split from torch.utils.data import DataLoader +import warnings from ..base_models.regressor import BaseMambularRegressor -from ..utils.config import MambularConfig from ..utils.dataset import MambularDataModule, MambularDataset from ..utils.preprocessor import Preprocessor +from ..utils.default_mamba_params import DefaultConfig class MambularRegressor(BaseEstimator): """ - A regressor implemented using PyTorch Lightning that follows the scikit-learn API conventions. This class is designed - to work with tabular data, offering a straightforward way to specify model configurations and preprocessing steps. It - integrates seamlessly with scikit-learn's tools such as cross-validation and grid search. + A regressor implemented using PyTorch Lightning that follows the scikit-learn API conventions. + This class is designed to work with tabular data, offering a straightforward way to specify + model configurations and preprocessing steps. It integrates seamlessly with scikit-learn's tools + such as cross-validation and grid search. Parameters ---------- - **kwargs : Various - Accepts any number of keyword arguments. Arguments recognized as model configuration options are passed to the - MambularConfig constructor. Remaining arguments are assumed to be preprocessor options and passed to the - Preprocessor constructor. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. 
+ weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. + dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). + head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. + Attributes ---------- - config : MambularConfig + config : DefaultConfig An object storing the configuration settings for the model. preprocessor : Preprocessor An object responsible for preprocessing the input data, such as encoding categorical variables and scaling numerical features. 
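Note on the parameters documented above: the constructor splits its keyword arguments between the model configuration and the preprocessor (see the `config_arg_names` and `preprocessor_arg_names` lists in the following hunk). A minimal sketch of the intended call, assuming the class is importable from `mambular.models.sklearn_regressor` as the file layout in this patch suggests; the specific values are arbitrary illustrations, not recommended settings:

    from mambular.models.sklearn_regressor import MambularRegressor

    # Illustrative values only: architecture kwargs are routed to the model
    # config object, preprocessing kwargs to the Preprocessor instance.
    reg = MambularRegressor(
        d_model=64,                     # model config
        n_layers=4,                     # model config
        dropout=0.05,                   # model config
        numerical_preprocessing="ple",  # preprocessor
        n_bins=50,                      # preprocessor
        task="regression",              # preprocessor (PLE binning uses the task type)
    )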
@@ -39,44 +103,60 @@ class MambularRegressor(BaseEstimator): def __init__(self, **kwargs): # Known config arguments - print("Received kwargs:", kwargs) config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", - "tabular_head_units", - "tabular_head_activation", - "tabular_head_dropout", - "num_emebedding_activation", - "layer_norm_after_embedding", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", + ] + + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", ] - self.config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**self.config_kwargs) - # The rest are assumed to be preprocessor arguments + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultConfig(**self.config_kwargs) + preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "classification": + warnings.warn( + "The task is set to 'classification'. MambularRegressor is designed for regression tasks.", + UserWarning, + ) + def get_params(self, deep=True): """ Get parameters for this estimator. Overrides the BaseEstimator method. @@ -86,13 +166,12 @@ def get_params(self, deep=True): deep : bool, default=True If True, returns the parameters for this estimator and contained sub-objects that are estimators. - Returns ------- params : dict Parameter names mapped to their values. """ - params = self.config_kwargs # Parameters used to initialize MambularConfig + params = self.config_kwargs # Parameters used to initialize DefaultConfig # If deep=True, include parameters from nested components like preprocessor if deep: @@ -114,7 +193,6 @@ def set_params(self, **parameters): **parameters : dict Estimator parameters to be set. - Returns ------- self : object @@ -122,8 +200,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -194,8 +271,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): data_module : MambularDataModule An instance of MambularDataModule containing the training and validation DataLoaders. 
""" - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -215,26 +291,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -242,13 +314,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): num_key = "num_" + key # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.float) @@ -258,8 +328,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): train_dataset = MambularDataset( train_cat_tensors, train_num_tensors, train_labels ) - val_dataset = MambularDataset( - val_cat_tensors, val_num_tensors, val_labels) + val_dataset = MambularDataset(val_cat_tensors, val_num_tensors, val_labels) # Create dataloaders train_dataloader = DataLoader( @@ -320,20 +389,20 @@ def fit( self, X, y, - val_size=0.2, + val_size: float = 0.2, X_val=None, y_val=None, - max_epochs=100, - random_state=101, - batch_size=128, - shuffle=True, - patience=10, - monitor="val_loss", - mode="min", - lr=1e-3, - lr_patience=5, - factor=0.75, - weight_decay=1e-06, + max_epochs: int = 100, + random_state: int = 101, + batch_size: int = 128, + shuffle: bool = True, + patience: int = 15, + monitor: str = "val_loss", + mode: str = "min", + lr: float = 1e-4, + lr_patience: int = 10, + factor: float = 0.1, + weight_decay: float = 1e-06, **trainer_kwargs ): """ @@ -369,7 +438,7 @@ def fit( Learning rate for the optimizer. lr_patience : int, default=10 Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. - factor : float, default=0.75 + factor : float, default=0.1 Factor by which the learning rate will be reduced. weight_decay : float, default=0.025 Weight decay (L2 penalty) coefficient. 
diff --git a/mambular/utils/default_mamba_params.py b/mambular/utils/default_mamba_params.py new file mode 100644 index 0000000..f46e4c4 --- /dev/null +++ b/mambular/utils/default_mamba_params.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass +import torch.nn as nn + + +@dataclass +class DefaultConfig: + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + d_model: int = 64 + n_layers: int = 8 + expand_factor: int = 2 + bias: bool = False + d_conv: int = 16 + conv_bias: bool = True + dropout: float = 0.05 + dt_rank: str = "auto" + d_state: int = 32 + dt_scale: float = 1.0 + dt_init: str = "random" + dt_max: float = 0.1 + dt_min: float = 1e-04 + dt_init_floor: float = 1e-04 + norm: str = "RMSNorm" + activation: callable = nn.SELU() + num_embedding_activation: callable = nn.Identity() + head_layer_sizes: list = (128, 64, 32) + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = (False,) + layer_norm_after_embedding: bool = (False,) + pooling_method: str = "avg" diff --git a/mambular/utils/mamba_arch.py b/mambular/utils/mamba_arch.py index 2e7ca7c..a1eb830 100644 --- a/mambular/utils/mamba_arch.py +++ b/mambular/utils/mamba_arch.py @@ -2,7 +2,14 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .config import MambularConfig +from .normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) ### Heavily inspired and mostly taken from https://github.com/alxndrTL/mamba.py @@ -16,13 +23,48 @@ class Mamba(nn.Module): layers (nn.ModuleList): List of MambaBlocks constituting the model. """ - def __init__(self, config: MambularConfig): + def __init__( + self, + d_model=32, + n_layers=8, + expand_factor=2, + bias=False, + d_conv=8, + conv_bias=True, + dropout=0.01, + dt_rank="auto", + d_state=16, + dt_scale=1.0, + dt_init="random", + dt_max=0.1, + dt_min=1e-03, + dt_init_floor=1e-04, + norm=RMSNorm, + activation=F.silu, + ): super().__init__() - self.config = config - self.layers = nn.ModuleList( - [ResidualBlock(config) for _ in range(config.n_layers)] + [ + ResidualBlock( + d_model, + expand_factor, + bias, + d_conv, + conv_bias, + dropout, + dt_rank, + d_state, + dt_scale, + dt_init, + dt_max, + dt_min, + dt_init_floor, + norm, + activation, + ) + for _ in range(n_layers) + ] ) def forward(self, x): @@ -40,11 +82,67 @@ class ResidualBlock(nn.Module): norm (RMSNorm): Normalization layer. """ - def __init__(self, config: MambularConfig): + def __init__( + self, + d_model=32, + expand_factor=2, + bias=False, + d_conv=16, + conv_bias=True, + dropout=0.01, + dt_rank="auto", + d_state=32, + dt_scale=1.0, + dt_init="random", + dt_max=0.1, + dt_min=1e-03, + dt_init_floor=1e-04, + norm=RMSNorm, + activation=F.silu, + ): super().__init__() - self.layers = MambaBlock(config) - self.norm = config.norm(config.d_model) + VALID_NORMALIZATION_LAYERS = { + "RMSNorm": RMSNorm, + "LayerNorm": LayerNorm, + "LearnableLayerScaling": LearnableLayerScaling, + "BatchNorm": BatchNorm, + "InstanceNorm": InstanceNorm, + "GroupNorm": GroupNorm, + } + + # Check if the provided normalization layer is valid + if isinstance(norm, type) and norm.__name__ not in VALID_NORMALIZATION_LAYERS: + raise ValueError( + f"Invalid normalization layer: {norm.__name__}. 
" + f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" + ) + elif isinstance(norm, str) and norm not in self.VALID_NORMALIZATION_LAYERS: + raise ValueError( + f"Invalid normalization layer: {norm}. " + f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" + ) + + if dt_rank == "auto": + dt_rank = math.ceil(d_model / 16) + + self.layers = MambaBlock( + d_model=d_model, + expand_factor=expand_factor, + bias=bias, + d_conv=d_conv, + conv_bias=conv_bias, + dropout=dropout, + dt_rank=dt_rank, + d_state=d_state, + dt_scale=dt_scale, + dt_init=dt_init, + dt_max=dt_max, + dt_min=dt_min, + dt_init_floor=dt_init_floor, + activation=activation, + ) + self.norm = norm(d_model) def forward(self, x): output = self.layers(self.norm(x)) + x @@ -65,53 +163,66 @@ class MambaBlock(nn.Module): out_proj (nn.Linear): Linear projection for output. """ - def __init__(self, config: MambularConfig): + def __init__( + self, + d_model=32, + expand_factor=2, + bias=False, + d_conv=16, + conv_bias=True, + dropout=0.01, + dt_rank="auto", + d_state=32, + dt_scale=1.0, + dt_init="random", + dt_max=0.1, + dt_min=1e-03, + dt_init_floor=1e-04, + activation=F.silu, + ): super().__init__() + self.d_inner = d_model * expand_factor - self.config = config - - self.in_proj = nn.Linear(config.d_model, 2 * config.d_inner, bias=config.bias) + self.in_proj = nn.Linear(d_model, 2 * self.d_inner, bias=bias) self.conv1d = nn.Conv1d( - in_channels=config.d_inner, - out_channels=config.d_inner, - kernel_size=config.d_conv, - bias=config.conv_bias, - groups=config.d_inner, - padding=config.d_conv - 1, + in_channels=self.d_inner, + out_channels=self.d_inner, + kernel_size=d_conv, + bias=conv_bias, + groups=self.d_inner, + padding=d_conv - 1, ) - self.dropout = nn.Dropout(config.dropout) + self.dropout = nn.Dropout(dropout) + self.activation = activation - self.x_proj = nn.Linear( - config.d_inner, config.dt_rank + 2 * config.d_state, bias=False - ) + self.x_proj = nn.Linear(self.d_inner, dt_rank + 2 * d_state, bias=False) - self.dt_proj = nn.Linear(config.dt_rank, config.d_inner, bias=True) + self.dt_proj = nn.Linear(dt_rank, self.d_inner, bias=True) - dt_init_std = config.dt_rank**-0.5 * config.dt_scale - if config.dt_init == "constant": + dt_init_std = dt_rank**-0.5 * dt_scale + if dt_init == "constant": nn.init.constant_(self.dt_proj.weight, dt_init_std) - elif config.dt_init == "random": + elif dt_init == "random": nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std) else: raise NotImplementedError dt = torch.exp( - torch.rand(config.d_inner) - * (math.log(config.dt_max) - math.log(config.dt_min)) - + math.log(config.dt_min) - ).clamp(min=config.dt_init_floor) + torch.rand(self.d_inner) * (math.log(dt_max) - math.log(dt_min)) + + math.log(dt_min) + ).clamp(min=dt_init_floor) inv_dt = dt + torch.log(-torch.expm1(-dt)) with torch.no_grad(): self.dt_proj.bias.copy_(inv_dt) - A = torch.arange(1, config.d_state + 1, dtype=torch.float32).repeat( - config.d_inner, 1 - ) + A = torch.arange(1, d_state + 1, dtype=torch.float32).repeat(self.d_inner, 1) self.A_log = nn.Parameter(torch.log(A)) - self.D = nn.Parameter(torch.ones(config.d_inner)) - self.out_proj = nn.Linear(config.d_inner, config.d_model, bias=config.bias) + self.D = nn.Parameter(torch.ones(self.d_inner)) + self.out_proj = nn.Linear(self.d_inner, d_model, bias=bias) + self.dt_rank = dt_rank + self.d_state = d_state def forward(self, x): _, L, _ = x.shape @@ -123,11 +234,11 @@ def forward(self, x): x = self.conv1d(x)[:, :, :L] x = 
x.transpose(1, 2) - x = F.silu(x) + x = self.activation(x) x = self.dropout(x) y = self.ssm(x) - z = F.silu(z) + z = self.activation(z) z = self.dropout(z) output = y * z @@ -143,7 +254,7 @@ def ssm(self, x): delta, B, C = torch.split( deltaBC, - [self.config.dt_rank, self.config.d_state, self.config.d_state], + [self.dt_rank, self.d_state, self.d_state], dim=-1, ) delta = F.softplus(self.dt_proj(delta)) @@ -160,9 +271,7 @@ def selective_scan_seq(self, x, delta, A, B, C, D): BX = deltaB * (x.unsqueeze(-1)) - h = torch.zeros( - x.size(0), self.config.d_inner, self.config.d_state, device=deltaA.device - ) + h = torch.zeros(x.size(0), self.d_inner, self.d_state, device=deltaA.device) hs = [] for t in range(0, L): diff --git a/mambular/utils/normalization_layers.py b/mambular/utils/normalization_layers.py index 817a2cd..5237177 100644 --- a/mambular/utils/normalization_layers.py +++ b/mambular/utils/normalization_layers.py @@ -15,7 +15,6 @@ class RMSNorm(nn.Module): def __init__(self, d_model: int, eps: float = 1e-5): super().__init__() - self.eps = eps self.weight = nn.Parameter(torch.ones(d_model)) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index 767e4f1..c443ed1 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -53,8 +53,8 @@ class Preprocessor: def __init__( self, - n_bins=200, - numerical_preprocessing="binning", + n_bins=50, + numerical_preprocessing="ple", use_decision_tree_bins=False, binning_strategy="uniform", task="regression", From 19fd5ea3c20337fae4ea1fc10630c60a6d0deab0 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 12:13:36 +0000 Subject: [PATCH 05/21] include configs in single file --- mambular/utils/configs.py | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 mambular/utils/configs.py diff --git a/mambular/utils/configs.py b/mambular/utils/configs.py new file mode 100644 index 0000000..821d23e --- /dev/null +++ b/mambular/utils/configs.py @@ -0,0 +1,57 @@ +from dataclasses import dataclass +import torch.nn as nn + + +@dataclass +class DefaultMambularConfig: + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + d_model: int = 64 + n_layers: int = 8 + expand_factor: int = 2 + bias: bool = False + d_conv: int = 16 + conv_bias: bool = True + dropout: float = 0.05 + dt_rank: str = "auto" + d_state: int = 32 + dt_scale: float = 1.0 + dt_init: str = "random" + dt_max: float = 0.1 + dt_min: float = 1e-04 + dt_init_floor: float = 1e-04 + norm: str = "RMSNorm" + activation: callable = nn.SELU() + num_embedding_activation: callable = nn.Identity() + head_layer_sizes: list = (128, 64, 32) + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = (False,) + layer_norm_after_embedding: bool = (False,) + pooling_method: str = "avg" + + +@dataclass +class DefaultFTTransformerConfig: + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + d_model: int = 64 + n_layers: int = 8 + n_heads: int = 4 + attn_dropout: float = 0.3 + ff_dropout: float = 0.3 + norm: str = "RMSNorm" + activation: callable = nn.SELU() + num_embedding_activation: callable = nn.Identity() + head_layer_sizes: list = (128, 64, 32) + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = (False,) + layer_norm_after_embedding: bool = (False,) + pooling_method: str = 
"avg" From 339cd6eaa5dfa6ed5c41acf4eb60e0c811b69755 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 12:13:46 +0000 Subject: [PATCH 06/21] delete current helper default params --- mambular/utils/default_mamba_params.py | 34 -------------------------- 1 file changed, 34 deletions(-) delete mode 100644 mambular/utils/default_mamba_params.py diff --git a/mambular/utils/default_mamba_params.py b/mambular/utils/default_mamba_params.py deleted file mode 100644 index f46e4c4..0000000 --- a/mambular/utils/default_mamba_params.py +++ /dev/null @@ -1,34 +0,0 @@ -from dataclasses import dataclass -import torch.nn as nn - - -@dataclass -class DefaultConfig: - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - d_model: int = 64 - n_layers: int = 8 - expand_factor: int = 2 - bias: bool = False - d_conv: int = 16 - conv_bias: bool = True - dropout: float = 0.05 - dt_rank: str = "auto" - d_state: int = 32 - dt_scale: float = 1.0 - dt_init: str = "random" - dt_max: float = 0.1 - dt_min: float = 1e-04 - dt_init_floor: float = 1e-04 - norm: str = "RMSNorm" - activation: callable = nn.SELU() - num_embedding_activation: callable = nn.Identity() - head_layer_sizes: list = (128, 64, 32) - head_dropout: float = 0.5 - head_skip_layers: bool = False - head_activation: callable = nn.SELU() - head_use_batch_norm: bool = (False,) - layer_norm_after_embedding: bool = (False,) - pooling_method: str = "avg" From fc4c5de07316aa5fe31cb4307786262682bee3ff Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 12:13:57 +0000 Subject: [PATCH 07/21] delete former activation mapping --- mambular/base_models/regressor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mambular/base_models/regressor.py b/mambular/base_models/regressor.py index 2ff671c..34d5283 100644 --- a/mambular/base_models/regressor.py +++ b/mambular/base_models/regressor.py @@ -11,7 +11,7 @@ InstanceNorm, GroupNorm, ) -from ..utils.default_mamba_params import DefaultConfig +from ..utils.configs import DefaultMambularConfig class BaseMambularRegressor(pl.LightningModule): @@ -19,7 +19,7 @@ def __init__( self, cat_feature_info, num_feature_info, - config: DefaultConfig = DefaultConfig(), + config: DefaultMambularConfig = DefaultMambularConfig(), **kwargs, ): super().__init__() From dee7c0ae716867b499ca1baad062d9c041204007 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 12:14:03 +0000 Subject: [PATCH 08/21] adjust config import --- mambular/models/sklearn_regressor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mambular/models/sklearn_regressor.py b/mambular/models/sklearn_regressor.py index c75f5d8..6a1ca1b 100644 --- a/mambular/models/sklearn_regressor.py +++ b/mambular/models/sklearn_regressor.py @@ -11,7 +11,7 @@ from ..base_models.regressor import BaseMambularRegressor from ..utils.dataset import MambularDataModule, MambularDataset from ..utils.preprocessor import Preprocessor -from ..utils.default_mamba_params import DefaultConfig +from ..utils.configs import DefaultMambularConfig class MambularRegressor(BaseEstimator): @@ -141,7 +141,7 @@ def __init__(self, **kwargs): ] self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} - self.config = DefaultConfig(**self.config_kwargs) + self.config = DefaultMambularConfig(**self.config_kwargs) preprocessor_kwargs = { k: v for k, v in kwargs.items() if k in preprocessor_arg_names From 9e71ddbe671e2f568cecea0535d5f90dfe852821 Mon Sep 17 00:00:00 2001 From: 
thielmaf Date: Wed, 29 May 2024 14:21:29 +0000 Subject: [PATCH 09/21] include documentation --- mambular/base_models/regressor.py | 65 ++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/mambular/base_models/regressor.py b/mambular/base_models/regressor.py index 34d5283..4171042 100644 --- a/mambular/base_models/regressor.py +++ b/mambular/base_models/regressor.py @@ -15,6 +15,56 @@ class BaseMambularRegressor(pl.LightningModule): + """ + A PyTorch Lightning Module for regression tasks utilizing the Mamba architecture and various normalization techniques. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + **kwargs : dict + Additional keyword arguments. + + Attributes + ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. + mamba : Mamba + Mamba architecture component. + norm_f : nn.Module + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + loss_fct : nn.Module + Loss function. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. 
+ """ + def __init__( self, cat_feature_info, @@ -36,19 +86,6 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "linear": nn.Identity(), - "silu": nn.functional.silu, - } - self.embedding_activation = self.hparams.get( "num_embedding_activation", config.num_embedding_activation ) @@ -219,7 +256,7 @@ def forward(self, cat_features, num_features): else: raise ValueError(f"Invalid pooling method: {self.pooling_method}") - x = self.norm_f.forward(x) + x = self.norm_f(x) preds = self.tabular_head(x) return preds From 8d9eebcf823be27f9c97163c2de16af3ee71ffde Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:21:43 +0000 Subject: [PATCH 10/21] adapt to new config + fix prediction for binary --- mambular/base_models/classifier.py | 227 ++++++++++++++++++----------- 1 file changed, 141 insertions(+), 86 deletions(-) diff --git a/mambular/base_models/classifier.py b/mambular/base_models/classifier.py index 3fe299d..8c818a7 100644 --- a/mambular/base_models/classifier.py +++ b/mambular/base_models/classifier.py @@ -2,9 +2,17 @@ import torch import torch.nn as nn import torchmetrics - -from ..utils.config import MambularConfig from ..utils.mamba_arch import Mamba +from ..utils.mlp_utils import MLP +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) +from ..utils.configs import DefaultMambularConfig class BaseMambularClassifier(pl.LightningModule): @@ -17,41 +25,49 @@ class BaseMambularClassifier(pl.LightningModule): Parameters ---------- num_classes : int - The number of classes in the classification task. For binary classification, this should be 2. - config : MambularConfig - An instance of MambularConfig containing configuration parameters for the Mambular model. - cat_feature_info : dict, optional - A dictionary mapping the names of categorical features to their number of unique categories. - This information is used to configure embedding layers for categorical features. Defaults to None. - num_feature_info : dict, optional - A dictionary mapping the names of numerical features to the size of their input dimensions. - This information is used to configure embedding layers for numerical features. Defaults to None. - lr : float, optional - The learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - The number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay (L2 penalty) parameter for the optimizer. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced. Defaults to 0.75. - - - Attributes + number of classes for classification. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + **kwargs : dict + Additional keyword arguments. + + + Attributes ---------- - embedding_activation : nn.Module - The activation function to be applied after the linear transformation of numerical features. 
- num_embeddings : nn.ModuleList - A list of sequential modules, each corresponding to an embedding layer for a numerical feature. - cat_embeddings : nn.ModuleList - A list of embedding layers, each corresponding to a categorical feature. + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. mamba : Mamba - The Mambular model for processing sequences of embeddings. + Mamba architecture component. norm_f : nn.Module - A normalization layer applied after the Mambular model. - tabular_head : nn.Linear - A linear layer for predicting the class labels from the aggregated embedding representation. - pooling_method : str - The method used to aggregate embeddings across features. Supported methods are 'avg', 'max', and 'sum'. + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. loss_fct : nn.Module The loss function used for training the model, configured based on the number of classes. acc : torchmetrics.Accuracy @@ -66,90 +82,125 @@ class BaseMambularClassifier(pl.LightningModule): def __init__( self, num_classes, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultMambularConfig = DefaultMambularConfig(), + **kwargs, ): super().__init__() - self.config = config self.num_classes = 1 if num_classes == 2 else num_classes - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), - "linear": nn.Identity(), - } + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) - self.embedding_activation = activations.get( - self.config.num_embedding_activation.lower() + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + 
n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), ) + + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") + if self.embedding_activation is None: raise ValueError( - f"Unsupported activation function: {self.config.num_embedding_activation}" + f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" ) self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(input_shape, self.config.d_model, bias=False), - nn.BatchNorm1d(self.config.d_model), - # Example using ReLU as the activation function, change as needed + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), self.embedding_activation, ) for feature_name, input_shape in num_feature_info.items() ] ) - # Create embedding layers for categorical features based on cat_feature_info self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) + head_activation = self.hparams.get("head_activation", config.head_activation) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + n_output_units=self.num_classes, ) - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension - - # Iterate over the specified units for each layer in the MLP - 
for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units - # Add the final linear layer to map to a single output value - mlp_layers.append(nn.Linear(input_dim, self.num_classes)) + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) - # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + self.loss_fct = nn.MSELoss() - self.pooling_method = self.config.pooling_method - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) if self.num_classes > 2: self.loss_fct = nn.CrossEntropyLoss() @@ -199,6 +250,8 @@ def forward(self, cat_features, num_features): ] cat_embeddings = torch.stack(cat_embeddings, dim=1) cat_embeddings = torch.squeeze(cat_embeddings, dim=2) + if self.hparams.get("layer_norm_after_embedding"): + cat_embeddings = self.embedding_norm(cat_embeddings) else: cat_embeddings = None @@ -208,6 +261,8 @@ def forward(self, cat_features, num_features): emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) ] num_embeddings = torch.stack(num_embeddings, dim=1) + if self.hparams.get("layer_norm_after_embedding"): + num_embeddings = self.embedding_norm(num_embeddings) else: num_embeddings = None @@ -358,7 +413,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( From b83ddacf30da48f236fc25f272e6fcb08b809957 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:27:19 +0000 Subject: [PATCH 11/21] adjust sklearn wrapper classifier --- mambular/models/sklearn_classifier.py | 200 ++++++++++++++++++-------- 1 file changed, 141 insertions(+), 59 deletions(-) diff --git a/mambular/models/sklearn_classifier.py b/mambular/models/sklearn_classifier.py index dbfce8a..bc6626e 100644 --- a/mambular/models/sklearn_classifier.py +++ b/mambular/models/sklearn_classifier.py @@ -1,5 +1,6 @@ import lightning as pl import numpy as np +import warnings import pandas as pd import torch from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint @@ -9,7 +10,7 @@ from torch.utils.data import DataLoader from ..base_models.classifier import BaseMambularClassifier -from ..utils.config import MambularConfig +from ..utils.configs import DefaultMambularConfig from ..utils.dataset import MambularDataModule, MambularDataset from ..utils.preprocessor import Preprocessor @@ -24,10 +25,72 @@ class MambularClassifier(BaseEstimator): Parameters ---------- - **kwargs : Various - Accepts any number of keyword arguments that are passed to the MambularConfig and Preprocessor classes. - Known configuration arguments for the model are extracted based on a predefined list, and the rest are - passed to the Preprocessor. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. 
+ weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. + dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). + head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. 
+ Attributes @@ -42,41 +105,56 @@ class MambularClassifier(BaseEstimator): def __init__(self, **kwargs): # Known config arguments - print("Received kwargs:", kwargs) config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", - "tabular_head_units", - "tabular_head_activation", - "tabular_head_dropout", - "num_emebedding_activation", - "layer_norm_after_embedding", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", ] - self.config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**self.config_kwargs) - # The rest are assumed to be preprocessor arguments + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", + ] + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultMambularConfig(**self.config_kwargs) + preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "regression": + warnings.warn( + "The task is set to 'regression'. MambularClassifier is designed for classification tasks. Setting the task to classification", + UserWarning, + ) + preprocessor_kwargs["task"] = "classification" + self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None @@ -126,8 +204,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -199,8 +276,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): An instance of MambularDataModule containing training and validation DataLoaders. 
""" - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -220,26 +296,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -247,13 +319,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): num_key = "num_" + key # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.long) @@ -326,20 +396,20 @@ def fit( self, X, y, - val_size=0.2, + val_size: float = 0.2, X_val=None, y_val=None, - max_epochs=100, - random_state=101, - batch_size=64, - shuffle=True, - patience=10, - monitor="val_loss", - mode="min", - lr=1e-3, - lr_patience=10, - factor=0.75, - weight_decay=0.025, + max_epochs: int = 100, + random_state: int = 101, + batch_size: int = 128, + shuffle: bool = True, + patience: int = 15, + monitor: str = "val_loss", + mode: str = "min", + lr: float = 1e-4, + lr_patience: int = 10, + factor: float = 0.1, + weight_decay: float = 1e-06, **trainer_kwargs ): """ @@ -489,14 +559,23 @@ def predict(self, X): # Perform inference with torch.no_grad(): logits = self.model(cat_tensors, num_tensors) - predictions = torch.argmax(logits, dim=1) + + # Check the shape of the logits to determine binary or multi-class classification + if logits.shape[1] == 1: + # Binary classification + probabilities = torch.sigmoid(logits) + predictions = (probabilities > 0.5).long().squeeze() + else: + # Multi-class classification + probabilities = torch.softmax(logits, dim=1) + predictions = torch.argmax(probabilities, dim=1) # Convert predictions to NumPy array and return return predictions.cpu().numpy() def predict_proba(self, X): """ - Predict class probabilities for the given input samples. + Predict class probabilities for the given input samples. 
Parameters ---------- @@ -554,7 +633,10 @@ def predict_proba(self, X): # Perform inference with torch.no_grad(): logits = self.model(cat_tensors, num_tensors) - probabilities = torch.softmax(logits, dim=1) + if logits.shape[1] > 1: + probabilities = torch.softmax(logits, dim=1) + else: + probabilities = torch.sigmoid(logits) # Convert probabilities to NumPy array and return return probabilities.cpu().numpy() From 234394cf961336dc573b21e764b0091d8b7f021f Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:27:31 +0000 Subject: [PATCH 12/21] adjust LSS basemodel for new config file --- mambular/base_models/distributional.py | 232 ++++++++++++++++--------- 1 file changed, 148 insertions(+), 84 deletions(-) diff --git a/mambular/base_models/distributional.py b/mambular/base_models/distributional.py index 3ed84ff..0d0db5d 100644 --- a/mambular/base_models/distributional.py +++ b/mambular/base_models/distributional.py @@ -2,13 +2,27 @@ import torch import torch.nn as nn -from ..utils.config import MambularConfig -from ..utils.distributions import (BetaDistribution, CategoricalDistribution, - DirichletDistribution, GammaDistribution, - InverseGammaDistribution, - NegativeBinomialDistribution, - NormalDistribution, PoissonDistribution, - StudentTDistribution) +from ..utils.configs import DefaultMambularConfig +from ..utils.mlp_utils import MLP +from ..utils.distributions import ( + BetaDistribution, + CategoricalDistribution, + DirichletDistribution, + GammaDistribution, + InverseGammaDistribution, + NegativeBinomialDistribution, + NormalDistribution, + PoissonDistribution, + StudentTDistribution, +) +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) from ..utils.mamba_arch import Mamba @@ -24,71 +38,74 @@ class BaseMambularLSS(pl.LightningModule): family : str The name of the statistical distribution family to be used for modeling. Supported families include 'normal', 'poisson', 'gamma', 'beta', 'dirichlet', 'studentt', 'negativebinom', 'inversegamma', and 'categorical'. - config : MambularConfig - An instance of MambularConfig containing configuration parameters for the model architecture. - cat_feature_info : dict, optional - A dictionary mapping the names of categorical features to their number of unique categories. Defaults to None. - num_feature_info : dict, optional - A dictionary mapping the names of numerical features to their number of dimensions after embedding. Defaults to None. - lr : float, optional - The initial learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - The number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay (L2 penalty) coefficient. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced. Defaults to 0.75. - **distribution_params : - Additional parameters specific to the chosen statistical distribution family. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + **kwargs : dict + Additional keyword arguments. Attributes ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. 
+ weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. mamba : Mamba - The core neural network module implementing the Mamba architecture. + Mamba architecture component. norm_f : nn.Module - Normalization layer applied after the Mamba block. - tabular_head : nn.Linear - Final linear layer mapping the features to the parameters of the chosen statistical distribution. - loss_fct : callable - The loss function derived from the chosen statistical distribution. + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + loss_fct : nn.Module + Loss function. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. """ def __init__( self, family, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultMambularConfig = DefaultMambularConfig(), **distribution_params, ): super().__init__() - self.config = config - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), - "linear": nn.Identity(), - } - distribution_classes = { "normal": NormalDistribution, "poisson": PoissonDistribution, @@ -107,63 +124,106 @@ def __init__( else: raise ValueError("Unsupported family: {}".format(family)) - self.embedding_activation = activations.get( - self.config.num_embedding_activation.lower() - ) + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == 
"LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") + if self.embedding_activation is None: raise ValueError( - f"Unsupported activation function: {self.config.num_embedding_activation}" + f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" ) + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), + ) + self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(input_shape, self.config.d_model, bias=False), - nn.BatchNorm1d(self.config.d_model), - # Example using ReLU as the activation function, change as needed + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), self.embedding_activation, ) for feature_name, input_shape in num_feature_info.items() ] ) - # Create embedding layers for categorical features based on cat_feature_info self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() - ) + head_activation = self.hparams.get("head_activation", config.head_activation) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + output_units=self.family.param_count, + ) - # Iterate over the specified units for each layer in the MLP - for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) - # Add the final linear layer to map to #distributional param output values - mlp_layers.append(nn.Linear(input_dim, self.family.param_count)) + self.loss_fct 
= nn.MSELoss() - # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) self.loss_fct = lambda predictions, y_true: self.family.compute_loss( predictions, y_true ) - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) - self.pooling_method = self.config.pooling_method - def forward(self, cat_features, num_features): """ Defines the forward pass of the model, processing both categorical and numerical features, @@ -197,6 +257,8 @@ def forward(self, cat_features, num_features): ] cat_embeddings = torch.stack(cat_embeddings, dim=1) cat_embeddings = torch.squeeze(cat_embeddings, dim=2) + if self.hparams.get("layer_norm_after_embedding"): + cat_embeddings = self.embedding_norm(cat_embeddings) else: cat_embeddings = None @@ -206,6 +268,8 @@ def forward(self, cat_features, num_features): emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) ] num_embeddings = torch.stack(num_embeddings, dim=1) + if self.hparams.get("layer_norm_after_embedding"): + num_embeddings = self.embedding_norm(num_embeddings) else: num_embeddings = None @@ -304,7 +368,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( From ca92f7f62818c0ffb70c810e1ea4d3893ae471fa Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:52:59 +0000 Subject: [PATCH 13/21] delete unnecessary valueerror --- mambular/base_models/regressor.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mambular/base_models/regressor.py b/mambular/base_models/regressor.py index 4171042..c8fca2c 100644 --- a/mambular/base_models/regressor.py +++ b/mambular/base_models/regressor.py @@ -129,11 +129,6 @@ def __init__( else: raise ValueError(f"Unsupported normalization layer: {norm_layer}") - if self.embedding_activation is None: - raise ValueError( - f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" - ) - self.num_embeddings = nn.ModuleList( [ nn.Sequential( From 16d72922818919fb9e6cb047de1ea332893de5bb Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:53:13 +0000 Subject: [PATCH 14/21] adjust distributional regression to new config --- mambular/base_models/distributional.py | 31 ++-- mambular/models/sklearn_distributional.py | 214 +++++++++++++++------- 2 files changed, 158 insertions(+), 87 deletions(-) diff --git a/mambular/base_models/distributional.py b/mambular/base_models/distributional.py index 0d0db5d..bcc7ead 100644 --- a/mambular/base_models/distributional.py +++ b/mambular/base_models/distributional.py @@ -78,8 +78,6 @@ class BaseMambularLSS(pl.LightningModule): Multi-layer perceptron head for tabular data. cls_token : nn.Parameter Class token parameter. - loss_fct : nn.Module - Loss function. embedding_norm : nn.Module, optional Layer normalization applied after embedding if specified. 
""" @@ -90,7 +88,8 @@ def __init__( cat_feature_info, num_feature_info, config: DefaultMambularConfig = DefaultMambularConfig(), - **distribution_params, + distributional_kwargs=None, + **kwargs, ): super().__init__() @@ -106,6 +105,10 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) + distribution_classes = { "normal": NormalDistribution, "poisson": PoissonDistribution, @@ -118,9 +121,11 @@ def __init__( "categorical": CategoricalDistribution, } + if distributional_kwargs is None: + distributional_kwargs = {} + if family in distribution_classes: - # Pass additional distribution_params to the constructor of the distribution class - self.family = distribution_classes[family](**distribution_params) + self.family = distribution_classes[family](**distributional_kwargs) else: raise ValueError("Unsupported family: {}".format(family)) @@ -143,11 +148,6 @@ def __init__( else: raise ValueError(f"Unsupported normalization layer: {norm_layer}") - if self.embedding_activation is None: - raise ValueError( - f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" - ) - # Additional layers and components initialization based on hyperparameters self.mamba = Mamba( d_model=self.hparams.get("d_model", config.d_model), @@ -206,7 +206,7 @@ def __init__( use_batch_norm=self.hparams.get( "head_use_batch_norm", config.head_use_batch_norm ), - output_units=self.family.param_count, + n_output_units=self.family.param_count, ) self.cls_token = nn.Parameter( @@ -220,9 +220,8 @@ def __init__( self.hparams.get("d_model", config.d_model) ) - self.loss_fct = lambda predictions, y_true: self.family.compute_loss( - predictions, y_true - ) + def compute_loss(self, predictions, y_true): + return self.family.compute_loss(predictions, y_true) def forward(self, cat_features, num_features): """ @@ -322,7 +321,7 @@ def training_step(self, batch, batch_idx): num_features, cat_features, labels = batch preds = self(num_features, cat_features) - loss = self.loss_fct(preds, labels) + loss = self.compute_loss(preds, labels) self.log( "train_loss", loss, @@ -348,7 +347,7 @@ def validation_step(self, batch, batch_idx): num_features, cat_features, labels = batch preds = self(num_features, cat_features) - loss = self.loss_fct(preds, labels) + loss = self.compute_loss(preds, labels) self.log( "val_loss", loss, diff --git a/mambular/models/sklearn_distributional.py b/mambular/models/sklearn_distributional.py index 550b4b3..3fbf006 100644 --- a/mambular/models/sklearn_distributional.py +++ b/mambular/models/sklearn_distributional.py @@ -1,5 +1,6 @@ import lightning as pl import numpy as np +import warnings import pandas as pd import properscoring as ps import torch @@ -10,12 +11,17 @@ from torch.utils.data import DataLoader from ..base_models.distributional import BaseMambularLSS -from ..utils.config import MambularConfig +from ..utils.configs import DefaultMambularConfig from ..utils.dataset import MambularDataModule, MambularDataset -from ..utils.distributional_metrics import (beta_brier_score, dirichlet_error, - gamma_deviance, inverse_gamma_loss, - negative_binomial_deviance, - poisson_deviance, student_t_loss) +from ..utils.distributional_metrics import ( + beta_brier_score, + dirichlet_error, + gamma_deviance, + inverse_gamma_loss, + negative_binomial_deviance, + poisson_deviance, + student_t_loss, +) from ..utils.preprocessor import Preprocessor @@ 
-27,62 +33,139 @@ class MambularLSS(BaseEstimator): facilitating end-to-end training and prediction workflows. The initialization of this class separates configuration arguments for the model and - the preprocessor, allowing for flexible adjustment of parameters. + the preprocessor, allowing for flexible adjustment of parameters. Parameters ---------- - **kwargs : Arbitrary keyword arguments, divided into configuration for the model and - preprocessing. Recognized keys include model parameters such as 'd_model', - 'n_layers', etc., and any additional keys are assumed to be preprocessor arguments. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. + weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. + dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). + head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. 
+ Attributes ---------- config : MambularConfig - Configuration object containing model-specific parameters. + Configuration object that holds model-specific settings. preprocessor : Preprocessor - Preprocessor object for data preprocessing steps. - model : torch.nn.Module - The neural network model, initialized based on 'config'. - - + Preprocessor object for handling feature preprocessing like normalization and encoding. + model : BaseMambularClassifier or None + The underlying PyTorch Lightning model, instantiated upon calling the `fit` method. """ def __init__(self, **kwargs): # Known config arguments config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", - "tabular_head_units", - "tabular_head_activation", - "tabular_head_dropout", - "num_emebedding_activation", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", + ] + + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", ] - config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**config_kwargs) + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultMambularConfig(**self.config_kwargs) - # The rest are assumed to be preprocessor arguments preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "regression": + warnings.warn( + "The task in preprocessing binning is set to 'regression'. Make sure that this is correct for your distributional family ", + UserWarning, + ) + self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None @@ -135,8 +218,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -210,8 +292,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): MambularDataModule An object containing DataLoaders for training and validation datasets. 
""" - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -231,26 +312,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -258,13 +335,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): num_key = "num_" + key # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.float) @@ -274,8 +349,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): train_dataset = MambularDataset( train_cat_tensors, train_num_tensors, train_labels ) - val_dataset = MambularDataset( - val_cat_tensors, val_num_tensors, val_labels) + val_dataset = MambularDataset(val_cat_tensors, val_num_tensors, val_labels) # Create dataloaders train_dataloader = DataLoader( @@ -336,21 +410,21 @@ def fit( X, y, family, - val_size=0.2, + val_size: float = 0.2, X_val=None, y_val=None, - max_epochs=100, - random_state=101, - batch_size=64, - shuffle=True, - patience=10, - monitor="val_loss", - mode="min", - lr=1e-3, - lr_patience=10, - factor=0.75, - weight_decay=0.025, - **trainer_kwargs, + max_epochs: int = 100, + random_state: int = 101, + batch_size: int = 128, + shuffle: bool = True, + patience: int = 15, + monitor: str = "val_loss", + mode: str = "min", + lr: float = 1e-4, + lr_patience: int = 10, + factor: float = 0.1, + weight_decay: float = 1e-06, + **trainer_kwargs ): """ Fits the model to the provided data, using the specified loss distribution family for the prediction task. 
@@ -519,8 +593,7 @@ def evaluate(self, X, y_true, metrics=None, distribution_family=None): """ # Infer distribution family from model settings if not provided if distribution_family is None: - distribution_family = getattr( - self.model, "distribution_family", "normal") + distribution_family = getattr(self.model, "distribution_family", "normal") # Setup default metrics if none are provided if metrics is None: @@ -558,8 +631,7 @@ def get_default_metrics(self, distribution_family): "MSE": lambda y, pred: mean_squared_error(y, pred[:, 0]), "CRPS": lambda y, pred: np.mean( [ - ps.crps_gaussian(y[i], mu=pred[i, 0], - sig=np.sqrt(pred[i, 1])) + ps.crps_gaussian(y[i], mu=pred[i, 0], sig=np.sqrt(pred[i, 1])) for i in range(len(y)) ] ), From ea407dd1b30ced154036270366f90695b0b83cc0 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:53:27 +0000 Subject: [PATCH 15/21] adjust prepro task warning --- mambular/models/sklearn_classifier.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mambular/models/sklearn_classifier.py b/mambular/models/sklearn_classifier.py index bc6626e..0e04918 100644 --- a/mambular/models/sklearn_classifier.py +++ b/mambular/models/sklearn_classifier.py @@ -150,9 +150,11 @@ def __init__(self, **kwargs): # Raise a warning if task is set to 'classification' if preprocessor_kwargs.get("task") == "regression": warnings.warn( - "The task is set to 'regression'. MambularClassifier is designed for classification tasks. Setting the task to classification", + "The task in preprocessing binning is set to 'regression'. MambularClassifier is designed for classification tasks.", UserWarning, ) + + if "task" not in list(preprocessor_kwargs.keys()): preprocessor_kwargs["task"] = "classification" self.preprocessor = Preprocessor(**preprocessor_kwargs) From 35a22b3d75717b16b306ad6f61ea05ba03fff1f1 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:16:20 +0000 Subject: [PATCH 16/21] delete valuerror warning --- mambular/base_models/classifier.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mambular/base_models/classifier.py b/mambular/base_models/classifier.py index 8c818a7..75aa29d 100644 --- a/mambular/base_models/classifier.py +++ b/mambular/base_models/classifier.py @@ -36,7 +36,7 @@ class BaseMambularClassifier(pl.LightningModule): Additional keyword arguments. - Attributes + Attributes ---------- lr : float Learning rate. 
@@ -145,11 +145,6 @@ def __init__( else: raise ValueError(f"Unsupported normalization layer: {norm_layer}") - if self.embedding_activation is None: - raise ValueError( - f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" - ) - self.num_embeddings = nn.ModuleList( [ nn.Sequential( From 6b2813e2ae4fedf45e31b869b6d446181e5e1fa0 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:30:28 +0000 Subject: [PATCH 17/21] adjust embedding regressor classes to new config --- mambular/base_models/embedding_regressor.py | 216 +++++++++++------- .../models/sklearn_embedding_regressor.py | 155 +++++++++---- 2 files changed, 253 insertions(+), 118 deletions(-) diff --git a/mambular/base_models/embedding_regressor.py b/mambular/base_models/embedding_regressor.py index d7c3b93..5d0921b 100644 --- a/mambular/base_models/embedding_regressor.py +++ b/mambular/base_models/embedding_regressor.py @@ -1,8 +1,16 @@ import lightning as pl import torch import torch.nn as nn - -from ..utils.config import MambularConfig +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) + +from ..utils.configs import DefaultMambularConfig from ..utils.mamba_arch import Mamba from ..utils.mlp_utils import MLP @@ -15,20 +23,12 @@ class BaseEmbeddingMambularRegressor(pl.LightningModule): Parameters ---------- - config : MambularConfig - Configuration parameters for the model architecture. - cat_feature_info : dict, optional - Information about categorical features, mapping feature names to the number of unique categories. Defaults to None. - num_feature_info : dict, optional - Information about numerical features, mapping feature names to their number of dimensions after embedding. Defaults to None. - lr : float, optional - Learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - Number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay coefficient for regularization in the optimizer. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced by the scheduler. Defaults to 0.75. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). seq_size : int, optional Size of sequence chunks for processing numerical features. Relevant when `raw_embeddings` is False. raw_embeddings : bool, optional @@ -37,66 +37,108 @@ class BaseEmbeddingMambularRegressor(pl.LightningModule): Attributes ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. mamba : Mamba - The core neural network module implementing the Mamba architecture. + Mamba architecture component. norm_f : nn.Module - Normalization layer applied after the Mamba block. 
- tabular_head : nn.Linear - Final linear layer mapping the features to the regression target. - loss_fct : nn.MSELoss - The loss function for regression tasks. + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. + loss_fct : nn.Module + The loss function used for training the model, MSE loss. + """ def __init__( self, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultMambularConfig = DefaultMambularConfig(), seq_size: int = 20, raw_embeddings=False, - head_layer_sizes=[64, 32, 32], - head_dropout: float = 0.3, - head_skip_layers: bool = False, - head_activation="leakyrelu", - head_use_batch_norm: bool = False, - attn_dropout: float = 0.3, + **kwargs, ): super().__init__() - self.config = config - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info + + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) self.seq_size = seq_size self.raw_embeddings = raw_embeddings - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), - "linear": nn.Identity(), - } - - self.embedding_activation = activations.get( - self.config.num_embedding_activation.lower() + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), ) - if self.embedding_activation is None: - raise ValueError( - 
f"Unsupported activation function: {self.config.num_embedding_activation}" + + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") if not self.raw_embeddings: data_size = len(num_feature_info.items()) @@ -104,7 +146,11 @@ def __init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(self.seq_size, self.config.d_model, bias=False), + nn.Linear( + self.seq_size, + self.hparams.get("d_model", config.d_model), + bias=False, + ), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) @@ -117,43 +163,55 @@ def __init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(1, self.config.d_model, bias=False), + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) - for _ in range(num_embedding_modules) + for feature_name, input_shape in num_feature_info.items() ] ) self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) - head_activation = activations.get(head_activation.lower(), nn.Identity()) + head_activation = self.hparams.get("head_activation", config.head_activation) - # Combine all layers into a Sequential module self.tabular_head = MLP( - self.config.d_model, - hidden_units_list=head_layer_sizes, - dropout_rate=head_dropout, - use_skip_layers=head_skip_layers, + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), activation_fn=head_activation, - use_batch_norm=head_use_batch_norm, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), ) - self.pooling_method = self.config.pooling_method - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) - - if self.config.layer_norm_after_embedding: - self.embedding_norm = nn.LayerNorm(self.config.d_model) + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) self.loss_fct = nn.MSELoss() + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) + def forward(self, cat_features, num_features): """ Defines the forward pass of the model, processing both categorical and numerical features, @@ -219,7 
+277,7 @@ def forward(self, cat_features, num_features): ] cat_embeddings = torch.stack(cat_embeddings, dim=1) cat_embeddings = torch.squeeze(cat_embeddings, dim=2) - if self.config.layer_norm_after_embedding: + if self.hparams.get("layer_norm_after_embedding"): cat_embeddings = self.embedding_norm(cat_embeddings) else: cat_embeddings = None @@ -230,7 +288,7 @@ def forward(self, cat_features, num_features): emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) ] num_embeddings = torch.stack(num_embeddings, dim=1) - if self.config.layer_norm_after_embedding: + if self.hparams.get("layer_norm_after_embedding"): num_embeddings = self.embedding_norm(num_embeddings) else: num_embeddings = None @@ -331,7 +389,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( diff --git a/mambular/models/sklearn_embedding_regressor.py b/mambular/models/sklearn_embedding_regressor.py index e39bf31..632e8a3 100644 --- a/mambular/models/sklearn_embedding_regressor.py +++ b/mambular/models/sklearn_embedding_regressor.py @@ -1,4 +1,5 @@ import lightning as pl +import warnings import pandas as pd import torch from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint @@ -9,7 +10,7 @@ from torch.utils.data import DataLoader from ..base_models.embedding_regressor import BaseEmbeddingMambularRegressor -from ..utils.config import MambularConfig +from ..utils.configs import DefaultMambularConfig from ..utils.dataset import EmbeddingMambularDataset, MambularDataModule from ..utils.preprocessor import Preprocessor @@ -22,8 +23,71 @@ class EmbeddingMambularRegressor(BaseEstimator): Parameters ---------- - **kwargs : Keyword arguments that can include both configuration parameters for the MambularConfig and - parameters for the preprocessor. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. + weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. 
+ dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). + head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. Attributes @@ -32,47 +96,68 @@ class EmbeddingMambularRegressor(BaseEstimator): Configuration object containing model-specific parameters. preprocessor : Preprocessor Preprocessor object for data preprocessing steps. - model : ProteinMambularRegressor + model : BaseEmbeddingMambularRegressor The neural network model, initialized after the `fit` method is called. 
""" def __init__(self, **kwargs): # Known config arguments config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", ] - config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**config_kwargs) - # The rest are assumed to be preprocessor arguments + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", + ] + + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultMambularConfig(**self.config_kwargs) + preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + if "numerical_preprocessing" not in list(preprocessor_kwargs.keys()): + preprocessor_kwargs["numerical_preprocessing"] = "standardization" - if not "numerical_preprocessing" in preprocessor_kwargs.keys(): - preprocessor_kwargs["numerical_preprocessing"] = "normalization" self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "classification": + warnings.warn( + "The task is set to 'classification'. MambularRegressor is designed for regression tasks.", + UserWarning, + ) + def get_params(self, deep=True): """ Get parameters for this estimator. Overrides the BaseEstimator method. @@ -117,8 +202,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -189,8 +273,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): data_module : MambularDataModule An instance of MambularDataModule containing the training and validation DataLoaders. 
""" - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -210,26 +293,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -239,13 +318,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): ) # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.float) From e71e13676fe550b3483d787d1bad7b1bcb2b532a Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:30:39 +0000 Subject: [PATCH 18/21] adjust embedding classifiers to new config --- mambular/base_models/embedding_classifier.py | 225 ++++++++++++------ .../models/sklearn_embedding_classifier.py | 164 +++++++++---- 2 files changed, 274 insertions(+), 115 deletions(-) diff --git a/mambular/base_models/embedding_classifier.py b/mambular/base_models/embedding_classifier.py index ba05649..a5c4551 100644 --- a/mambular/base_models/embedding_classifier.py +++ b/mambular/base_models/embedding_classifier.py @@ -3,8 +3,17 @@ import torch.nn as nn import torchmetrics -from ..utils.config import MambularConfig from ..utils.mamba_arch import Mamba +from ..utils.mlp_utils import MLP +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) +from ..utils.configs import DefaultMambularConfig class BaseEmbeddingMambularClassifier(pl.LightningModule): @@ -15,20 +24,14 @@ class BaseEmbeddingMambularClassifier(pl.LightningModule): Parameters ---------- - config : MambularConfig - Configuration parameters for the model architecture. - cat_feature_info : dict, optional - Information about categorical features, mapping feature names to the number of unique categories. - num_feature_info : dict, optional - Information about numerical features, mapping feature names to their number of dimensions after embedding. 
- lr : float, optional - Learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - Number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay coefficient for regularization in the optimizer. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced by the scheduler. Defaults to 0.75. + num_classes : int + number of classes for classification. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). seq_size : int, optional Size of sequence chunks for processing numerical features. Relevant when `raw_embeddings` is False. raw_embeddings : bool, optional @@ -37,52 +40,116 @@ class BaseEmbeddingMambularClassifier(pl.LightningModule): Attributes ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. mamba : Mamba - The core neural network module implementing the Mamba architecture. + Mamba architecture component. norm_f : nn.Module - Normalization layer applied after the Mamba block. - tabular_head : nn.Linear - Final linear layer mapping the features to the target. + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. + loss_fct : nn.Module + The loss function used for training the model, configured based on the number of classes. + acc : torchmetrics.Accuracy + A metric for computing the accuracy of predictions. + auroc : torchmetrics.AUROC + A metric for computing the Area Under the Receiver Operating Characteristic curve. + precision : torchmetrics.Precision + A metric for computing the precision of predictions. 
+ """ def __init__( self, num_classes, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultMambularConfig = DefaultMambularConfig(), seq_size: int = 20, raw_embeddings=False, + **kwargs, ): super().__init__() - self.config = config self.num_classes = 1 if num_classes == 2 else num_classes - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info + + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) self.seq_size = seq_size self.raw_embeddings = raw_embeddings - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), - "linear": nn.Identity(), - } + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), + ) + + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") if not self.raw_embeddings: data_size = len(num_feature_info.items()) @@ -90,8 +157,11 @@ def 
__init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(self.seq_size, - self.config.d_model, bias=False), + nn.Linear( + self.seq_size, + self.hparams.get("d_model", config.d_model), + bias=False, + ), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) @@ -104,49 +174,55 @@ def __init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(1, self.config.d_model, bias=False), + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) - for _ in range(num_embedding_modules) + for feature_name, input_shape in num_feature_info.items() ] ) self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() - ) - - # Dynamically create MLP layers based on config.tabular_units - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension - - # Iterate over the specified units for each layer in the MLP - for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units + head_activation = self.hparams.get("head_activation", config.head_activation) - # Add the final linear layer to map to a single output value - mlp_layers.append(nn.Linear(input_dim, self.num_classes)) + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + n_output_units=self.num_classes, + ) - # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) - self.pooling_method = self.config.pooling_method - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) + self.loss_fct = nn.MSELoss() - if self.config.layer_norm_after_embedding: - self.embedding_norm = nn.LayerNorm(self.config.d_model) + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) if self.num_classes > 2: self.loss_fct = nn.CrossEntropyLoss() @@ -184,8 +260,7 @@ def forward(self, cat_features, num_features): The output predictions of the model for regression tasks. """ batch_size = ( - cat_features[0].size(0) if cat_features != [ - ] else num_features[0].size(0) + cat_features[0].size(0) if cat_features != [] else num_features[0].size(0) ) cls_tokens = self.cls_token.expand(batch_size, -1, -1) # Process categorical features if present @@ -398,7 +473,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. 
""" optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( diff --git a/mambular/models/sklearn_embedding_classifier.py b/mambular/models/sklearn_embedding_classifier.py index 737549f..3dc8216 100644 --- a/mambular/models/sklearn_embedding_classifier.py +++ b/mambular/models/sklearn_embedding_classifier.py @@ -1,6 +1,8 @@ import lightning as pl import numpy as np import pandas as pd +import warnings + import torch from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint from sklearn.base import BaseEstimator @@ -10,7 +12,7 @@ from torch.utils.data import DataLoader from ..base_models.embedding_classifier import BaseEmbeddingMambularClassifier -from ..utils.config import MambularConfig +from ..utils.configs import DefaultMambularConfig from ..utils.dataset import EmbeddingMambularDataset, MambularDataModule from ..utils.preprocessor import Preprocessor @@ -24,51 +26,141 @@ class EmbeddingMambularClassifier(BaseEstimator): Parameters ---------- - **kwargs : Configuration parameters that can include both MambularConfig settings and preprocessing - options. Any unrecognized parameters are passed to the preprocessor. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. + weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. + dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). 
+ head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. Attributes ---------- config : MambularConfig - Configuration object for the model, storing architecture-specific parameters. + Configuration object containing model-specific parameters. preprocessor : Preprocessor - Object handling data preprocessing steps such as feature encoding and normalization. - model : ProteinMambularClassifier - The underlying neural network model, instantiated during the `fit` method. + Preprocessor object for data preprocessing steps. + model : BaseEmbeddingMambularRegressor + The neural network model, initialized after the `fit` method is called. """ def __init__(self, **kwargs): # Known config arguments config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", ] - config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**config_kwargs) - # The rest are assumed to be preprocessor arguments + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", + ] + + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultMambularConfig(**self.config_kwargs) + preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + if "numerical_preprocessing" not in list(preprocessor_kwargs.keys()): + preprocessor_kwargs["numerical_preprocessing"] = "standardization" + + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "regression": + warnings.warn( + "The task is set to 'regression'. 
This model is designed for classification tasks.", + UserWarning, + ) + + if "task" not in list(preprocessor_kwargs.keys()): + preprocessor_kwargs["task"] = "classification" + self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None @@ -117,8 +209,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -189,8 +280,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): data_module : MambularDataModule An instance of MambularDataModule containing training and validation DataLoaders. """ - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -210,26 +300,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -239,13 +325,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): ) # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.long) From b1d33e2e4a10123ce6177c939988d6c912a356b1 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:50:30 +0000 Subject: [PATCH 19/21] raise notFitted Error when calling transform on unfitted preprocessor --- mambular/utils/preprocessor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index c443ed1..0da3c46 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -13,7 +13,7 @@ from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from .ple_encoding import PLE -from .ple_encoding import PLE +from sklearn.exceptions import 
NotFittedError class Preprocessor: @@ -277,6 +277,10 @@ def transform(self, X): dict: A dictionary where keys are the names of the features (as per the transformations defined in the column transformer) and the values are numpy arrays of the transformed data. """ + if not self.fitted: + raise NotFittedError( + "The preprocessor must be fitted before transforming new data. Use .fit or .fit_transform" + ) transformed_X = self.column_transformer.transform(X) # Now let's convert this into a dictionary of arrays, one per column From 9fc9eeaad5eaec610e6276099c648aef2df5285a Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:50:37 +0000 Subject: [PATCH 20/21] adjust imports in tests --- tests/test_classifier.py | 7 +++---- tests/test_distributions.py | 2 -- tests/test_lss.py | 15 +++++++-------- tests/test_preprocessor.py | 10 +++++----- tests/test_regressor.py | 10 ++++------ 5 files changed, 19 insertions(+), 25 deletions(-) diff --git a/tests/test_classifier.py b/tests/test_classifier.py index 440a1c9..1bda0a4 100644 --- a/tests/test_classifier.py +++ b/tests/test_classifier.py @@ -35,9 +35,9 @@ def tearDown(self): def test_initialization(self): # This assumes MambularConfig is properly imported and used in the MambularRegressor class - from mambular.utils.config import MambularConfig + from mambular.utils.configs import DefaultMambularConfig - self.assertIsInstance(self.classifier.config, MambularConfig) + self.assertIsInstance(self.classifier.config, DefaultMambularConfig) self.assertEqual(self.classifier.config.d_model, 128) self.assertEqual(self.classifier.config.dropout, 0.1) @@ -90,8 +90,7 @@ def test_evaluate(self): axis=1, keepdims=True ) self.classifier.predict = MagicMock(return_value=mock_predictions) - self.classifier.predict_proba = MagicMock( - return_value=mock_probabilities) + self.classifier.predict_proba = MagicMock(return_value=mock_probabilities) # Define metrics to test metrics = { diff --git a/tests/test_distributions.py b/tests/test_distributions.py index 21f969e..a77113f 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -47,8 +47,6 @@ def test_compute_loss_known_values(self): loc=predictions[:, 0], scale=torch.nn.functional.softplus(predictions[:, 1]) ) expected_loss = -test_dist.log_prob(torch.tensor(0.0)).mean() - print(loss, expected_loss) - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) def test_evaluate_nll(self): diff --git a/tests/test_lss.py b/tests/test_lss.py index f621cdf..9da647f 100644 --- a/tests/test_lss.py +++ b/tests/test_lss.py @@ -4,8 +4,9 @@ import numpy as np import pandas as pd import torch -from properscoring import \ - crps_gaussian # Assuming this is the source of the CRPS function +from properscoring import ( + crps_gaussian, +) # Assuming this is the source of the CRPS function from sklearn.metrics import mean_poisson_deviance, mean_squared_error from mambular.models import MambularLSS # Update the import path @@ -40,9 +41,9 @@ def tearDown(self): self.patcher_base_model.stop() def test_initialization(self): - from mambular.utils.config import MambularConfig + from mambular.utils.configs import DefaultMambularConfig - self.assertIsInstance(self.model.config, MambularConfig) + self.assertIsInstance(self.model.config, DefaultMambularConfig) self.assertEqual(self.model.config.d_model, 128) self.assertEqual(self.model.config.dropout, 0.1) self.assertEqual(self.model.config.n_layers, 4) @@ -91,8 +92,7 @@ def test_normal_metrics(self): "MSE": lambda y, pred: mean_squared_error(y, 
pred[:, 0]), "CRPS": lambda y, pred: np.mean( [ - crps_gaussian(y[i], mu=pred[i, 0], - sig=np.sqrt(pred[i, 1])) + crps_gaussian(y[i], mu=pred[i, 0], sig=np.sqrt(pred[i, 1])) for i in range(len(y)) ] ), @@ -124,8 +124,7 @@ def test_poisson_metrics(self): ) self.assertIn("Poisson Deviance", results) # Optionally calculate expected deviance and check - expected_deviance = mean_poisson_deviance( - self.y_test, mock_predictions) + expected_deviance = mean_poisson_deviance(self.y_test, mock_predictions) self.assertAlmostEqual(results["Poisson Deviance"], expected_deviance) diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index fb43c64..7cf4f84 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -20,7 +20,7 @@ def setUp(self): def test_initialization(self): """Test initialization of the Preprocessor with default parameters.""" - pp = Preprocessor(n_bins=20) + pp = Preprocessor(n_bins=20, numerical_preprocessing="binning") self.assertEqual(pp.n_bins, 20) self.assertEqual(pp.numerical_preprocessing, "binning") self.assertFalse(pp.use_decision_tree_bins) @@ -28,7 +28,7 @@ def test_initialization(self): def test_fit(self): """Test the fitting process of the preprocessor.""" pp = Preprocessor(numerical_preprocessing="binning", n_bins=20) - pp.fit(self.data) + pp.fit(self.data, self.target) self.assertIsNotNone(pp.column_transformer) def test_transform_not_fitted(self): @@ -40,7 +40,7 @@ def test_transform_not_fitted(self): def test_fit_transform(self): """Test fitting and transforming the data.""" pp = Preprocessor(numerical_preprocessing="standardization") - transformed_data = pp.fit_transform(self.data) + transformed_data = pp.fit_transform(self.data, self.target) self.assertIsInstance(transformed_data, dict) self.assertTrue("num_numerical" in transformed_data) self.assertTrue("cat_categorical" in transformed_data) @@ -48,7 +48,7 @@ def test_fit_transform(self): def test_ple(self): """Test fitting and transforming the data.""" pp = Preprocessor(numerical_preprocessing="ple", n_bins=20) - transformed_data = pp.fit_transform(self.data) + transformed_data = pp.fit_transform(self.data, self.target) self.assertIsInstance(transformed_data, dict) self.assertTrue("num_numerical" in transformed_data) self.assertTrue("cat_categorical" in transformed_data) @@ -59,7 +59,7 @@ def test_transform_with_missing_values(self): data_with_missing.loc[0, "numerical"] = np.nan data_with_missing.loc[1, "categorical"] = np.nan pp = Preprocessor(numerical_preprocessing="normalization") - transformed_data = pp.fit_transform(data_with_missing) + transformed_data = pp.fit_transform(data_with_missing, self.target) self.assertNotIn(np.nan, transformed_data["num_numerical"]) self.assertNotIn(np.nan, transformed_data["cat_categorical"]) diff --git a/tests/test_regressor.py b/tests/test_regressor.py index ab405b3..8daff5d 100644 --- a/tests/test_regressor.py +++ b/tests/test_regressor.py @@ -34,9 +34,9 @@ def tearDown(self): def test_initialization(self): # This assumes MambularConfig is properly imported and used in the MambularRegressor class - from mambular.utils.config import MambularConfig + from mambular.utils.configs import DefaultMambularConfig - self.assertIsInstance(self.regressor.config, MambularConfig) + self.assertIsInstance(self.regressor.config, DefaultMambularConfig) self.assertEqual(self.regressor.config.d_model, 128) self.assertEqual(self.regressor.config.dropout, 0.1) @@ -65,8 +65,7 @@ def test_predict(self): # Create mock return objects that mimic tensor behavior 
mock_prediction = MagicMock() mock_prediction.cpu.return_value = MagicMock() - mock_prediction.cpu.return_value.numpy.return_value = np.array([ - 0.5] * 100) + mock_prediction.cpu.return_value.numpy.return_value = np.array([0.5] * 100) # Mock the model and its method calls self.regressor.model = MagicMock() @@ -87,8 +86,7 @@ def test_evaluate(self): self.regressor.predict = MagicMock(return_value=mock_predictions) # Define metrics to test - metrics = {"Mean Squared Error": mean_squared_error, - "R2 Score": r2_score} + metrics = {"Mean Squared Error": mean_squared_error, "R2 Score": r2_score} # Call evaluate with the defined metrics result = self.regressor.evaluate(self.X, self.y, metrics=metrics) From bbf07c93519e3ef087690910834249ee183c1d6c Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:51:35 +0000 Subject: [PATCH 21/21] delete old config --- mambular/utils/config.py | 150 --------------------------------------- 1 file changed, 150 deletions(-) delete mode 100644 mambular/utils/config.py diff --git a/mambular/utils/config.py b/mambular/utils/config.py deleted file mode 100644 index 3757d5b..0000000 --- a/mambular/utils/config.py +++ /dev/null @@ -1,150 +0,0 @@ -from dataclasses import dataclass, asdict, field -import json -import os -import math -from typing import Union, Type, List -from .normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) -import torch.nn as nn - - -@dataclass -class MambularConfig: - """ - A configuration class specific to the Mambular model. - Handles Mamba-specific hyperparameters as well as vocabulary size and output dimensions. - - Attributes: - d_model (int): The dimensionality of the input and output tensors. - n_layers (int): The number of MambaBlocks in the model. - dt_rank (Union[int, str]): The rank of the dynamical time tensor. - Can be an integer or 'auto' to calculate automatically based on d_model. - d_state (int): The dimensionality of the state tensor. - expand_factor (int): The factor by which the inner dimensionality is expanded. - d_conv (int): The dimensionality of the convolutional layer. - - dt_min (float): The minimum value for dynamical time. - dt_max (float): The maximum value for dynamical time. - dt_init (str): The initialization method for dynamical time. Either 'constant' or 'random'. - dt_scale (float): The scale factor for dynamical time initialization. - dt_init_floor (float): The floor value for dynamical time initialization. - - dropout (float): The dropout probability. - bias (bool): Whether to include bias in linear layers. - weight_decay (float): weight decay in optimizer. - conv_bias (bool): Whether to include bias in the convolutional layer. - vocab_size (list): The sizes of the vocabulary for the features used by the Mambular model. - output_dimension (int): The dimensionality of the output layer. - pooling_method (str): The pooling method for combining token embeddings. - Options: 'avg', 'max', 'sum', 'cls_token'. - norm (nn.Module): The normalization layer to use. - Options: RMSNorm, LayerNorm, LearnableLayerScaling, BatchNorm, InstanceNorm, GroupNorm. - - Methods: - __post_init__(): Performs additional initialization steps or checks after instance creation. - save_pretrained(save_directory: str): Saves the configuration to a JSON file. - - Raises: - ValueError: If invalid values are provided for pooling method or normalization layer. 
- """ - - VALID_POOLING_METHODS = ["avg", "max", "sum", "cls_token"] - - VALID_NORMALIZATION_LAYERS = { - "RMSNorm": RMSNorm, - "LayerNorm": LayerNorm, - "LearnableLayerScaling": LearnableLayerScaling, - "BatchNorm": BatchNorm, - "InstanceNorm": InstanceNorm, - "GroupNorm": GroupNorm, - } - - d_model: int = 64 - n_layers: int = 6 - dt_rank: Union[int, str] = "auto" - d_state: int = 32 - expand_factor: int = 2 - d_conv: int = 8 - - dt_min: float = 0.001 - dt_max: float = 0.1 - dt_init: str = "random" - dt_scale: float = 1.0 - dt_init_floor: float = 1e-4 - dropout: float = 0.05 - - bias: bool = False - weight_decay: float = 0.025 - conv_bias: bool = True - output_dimension: int = 1 - pooling_method: str = "avg" - norm: Union[str, Type[nn.Module]] = RMSNorm - num_embedding_activation: str = "linear" - tabular_head_units: list = field(default_factory=lambda: [128, 64, 64]) - tabular_head_activation: str = "relu" - tabular_head_dropout: float = 0.3 - layer_norm_after_embedding: bool = True - - def __post_init__(self): - """ - Called automatically after the initialization of MambularConfig instances. - Performs additional initialization steps or checks, if required. - """ - self.d_inner = self.expand_factor * self.d_model - - if self.dt_rank == "auto": - self.dt_rank = math.ceil(self.d_model / 16) - - # Check if the provided pooling method is valid - if self.pooling_method not in self.VALID_POOLING_METHODS: - raise ValueError( - f"Invalid pooling method: {self.pooling_method}. " - f"Valid options are: {', '.join(self.VALID_POOLING_METHODS)}" - ) - - # Check if the provided normalization layer is valid - if ( - isinstance(self.norm, type) - and self.norm.__name__ not in self.VALID_NORMALIZATION_LAYERS - ): - raise ValueError( - f"Invalid normalization layer: {self.norm.__name__}. " - f"Valid options are: {', '.join(self.VALID_NORMALIZATION_LAYERS.keys())}" - ) - elif ( - isinstance(self.norm, str) - and self.norm not in self.VALID_NORMALIZATION_LAYERS - ): - raise ValueError( - f"Invalid normalization layer: {self.norm}. " - f"Valid options are: {', '.join(self.VALID_NORMALIZATION_LAYERS.keys())}" - ) - - def save_pretrained(self, save_directory: str): - """ - Saves the configuration parameters of the MambularConfig instance to a JSON file - in the specified directory. This is useful for model persistence, reproducibility, - or reloading the model configuration in the future. - - Parameters: - save_directory (str): The directory path where the configuration JSON file will be saved. - - Returns: - None: The method prints the path where the configuration is saved but does not return any value. - """ - os.makedirs(save_directory, exist_ok=True) - # Define the configuration file path - config_file = os.path.join(save_directory, "config.json") - - # Convert the dataclass to a dictionary and then to a JSON string - config_dict = asdict(self) - with open(config_file, "w") as f: - json.dump(config_dict, f, indent=4) - - print(f"Configuration saved in {config_file}")