From 0b222a4e70e21a5ebff8a07d68ec674effd77a5f Mon Sep 17 00:00:00 2001 From: thielmaf Date: Tue, 28 May 2024 07:29:03 +0000 Subject: [PATCH 01/21] test new preprocessing (ple) --- mambular/utils/mlp_utils.py | 245 +++++++++++++++++++++++++++ mambular/utils/ple_encoding.py | 156 ++++++++++++++++++ mambular/utils/prepro_utils.py | 167 +++++++++++++++++++ mambular/utils/preprocessor.py | 292 +++++++++------------------------ setup.py | 2 +- tests/test_preprocessor.py | 9 +- 6 files changed, 658 insertions(+), 213 deletions(-) create mode 100644 mambular/utils/mlp_utils.py create mode 100644 mambular/utils/ple_encoding.py create mode 100644 mambular/utils/prepro_utils.py diff --git a/mambular/utils/mlp_utils.py b/mambular/utils/mlp_utils.py new file mode 100644 index 0000000..78dc209 --- /dev/null +++ b/mambular/utils/mlp_utils.py @@ -0,0 +1,245 @@ +import torch +import torch.nn as nn + + +class Linear_skip_block(nn.Module): + """ + A neural network block that includes a linear layer, an activation function, a dropout layer, and optionally a + skip connection and batch normalization. The skip connection is added if the input and output feature sizes are equal. + + Parameters + ---------- + n_input : int + The number of input features. + n_output : int + The number of output features. + dropout_rate : float + The rate of dropout to apply for regularization. + activation_fn : torch.nn.modules.activation, optional + The activation function to use after the linear layer. Default is nn.LeakyReLU(). + use_batch_norm : bool, optional + Whether to apply batch normalization after the activation function. Default is False. + + Attributes + ---------- + fc : torch.nn.Linear + The linear transformation layer. + act : torch.nn.Module + The activation function. + drop : torch.nn.Dropout + The dropout layer. + use_batch_norm : bool + Indicator of whether batch normalization is used. + batch_norm : torch.nn.BatchNorm1d, optional + The batch normalization layer, instantiated if use_batch_norm is True. + use_skip : bool + Indicator of whether a skip connection is used. + """ + + def __init__( + self, + n_input, + n_output, + dropout_rate, + activation_fn=nn.LeakyReLU(), + use_batch_norm=False, + ): + super(Linear_skip_block, self).__init__() + + self.fc = nn.Linear(n_input, n_output) + self.act = activation_fn + self.drop = nn.Dropout(dropout_rate) + self.use_batch_norm = use_batch_norm + self.use_skip = ( + n_input == n_output + ) # Only use skip connection if input and output sizes are equal + + if use_batch_norm: + self.batch_norm = nn.BatchNorm1d(n_output) # Initialize batch normalization + + def forward(self, x): + """ + Defines the forward pass of the Linear_block. + + Parameters + ---------- + x : Tensor + The input tensor to the block. + + Returns + ------- + Tensor + The output tensor after processing through the linear layer, activation function, dropout, + and optional batch normalization. + """ + x0 = x # Save input for possible skip connection + x = self.fc(x) + x = self.act(x) + + if self.use_batch_norm: + x = self.batch_norm(x) # Apply batch normalization after activation + + if self.use_skip: + x = x + x0 # Add skip connection if applicable + + x = self.drop(x) # Apply dropout + return x + + +class Linear_block(nn.Module): + """ + A neural network block that includes a linear layer, an activation function, a dropout layer, and optionally batch normalization. + + Parameters + ---------- + n_input : int + The number of input features. + n_output : int + The number of output features. 
+ dropout_rate : float + The rate of dropout to apply. + activation_fn : torch.nn.modules.activation, optional + The activation function to use after the linear layer. Default is nn.LeakyReLU(). + batch_norm : bool, optional + Whether to include batch normalization after the activation function. Default is False. + + Attributes + ---------- + block : torch.nn.Sequential + A sequential container holding the linear layer, activation function, dropout, and optionally batch normalization. + """ + + def __init__( + self, + n_input, + n_output, + dropout_rate, + activation_fn=nn.LeakyReLU(), + batch_norm=False, + ): + super(Linear_block, self).__init__() + + # Initialize modules + modules = [ + nn.Linear(n_input, n_output), + activation_fn, + nn.Dropout(dropout_rate), + ] + + # Optionally add batch normalization + if batch_norm: + modules.append(nn.BatchNorm1d(n_output)) + + # Create the sequential model + self.block = nn.Sequential(*modules) + + def forward(self, x): + """ + Defines the forward pass of the Linear_block. + + Parameters + ---------- + x : Tensor + The input tensor to the block. + + Returns + ------- + Tensor + The output tensor after processing through the linear layer, activation function, dropout, + and optional batch normalization. + """ + # Pass the input through the block + return self.block(x) + + +class MLP(nn.Module): + """ + A multi-layer perceptron (MLP) for regression tasks, configurable with optional skip connections and batch normalization. + + Parameters + ---------- + n_input_units : int + The number of units in the input layer. + hidden_units_list : list of int + A list specifying the number of units in each hidden layer. + n_output_units : int + The number of units in the output layer. + dropout_rate : float + The dropout rate used across the MLP. + use_skip_layers : bool, optional + Whether to use skip connections in layers where input and output sizes match. Default is False. + activation_fn : torch.nn.modules.activation, optional + The activation function used across the layers. Default is nn.LeakyReLU(). + use_batch_norm : bool, optional + Whether to apply batch normalization in each layer. Default is False. + + Attributes + ---------- + hidden_layers : torch.nn.Sequential + Sequential container of layers comprising the MLP's hidden layers. + linear_final : torch.nn.Linear + The final linear layer of the MLP. + """ + + def __init__( + self, + n_input_units, + hidden_units_list=[64, 32, 32], + n_output_units: int = 1, + dropout_rate: float = 0.1, + use_skip_layers: bool = False, + activation_fn=nn.LeakyReLU(), + use_batch_norm: bool = False, + ): + super(MLP, self).__init__() + self.n_input_units = n_input_units + self.hidden_units_list = hidden_units_list + self.dropout_rate = dropout_rate + self.n_output_units = n_output_units + + layers = [] + input_units = n_input_units + + for n_hidden_units in hidden_units_list: + if use_skip_layers and input_units == n_hidden_units: + layers.append( + Linear_skip_block( + input_units, + n_hidden_units, + dropout_rate, + activation_fn, + use_batch_norm, + ) + ) + else: + layers.append( + Linear_block( + input_units, + n_hidden_units, + dropout_rate, + activation_fn, + use_batch_norm, + ) + ) + input_units = n_hidden_units # Update input_units for the next layer + + self.hidden_layers = nn.Sequential(*layers) + self.linear_final = nn.Linear(input_units, n_output_units) # Final layer + + def forward(self, x): + """ + Defines the forward pass of the MLP. 
+ + Parameters + ---------- + x : Tensor + The input tensor to the MLP. + + Returns + ------- + Tensor + The output predictions of the model for regression tasks. + """ + x = self.hidden_layers(x) + x = self.linear_final(x) + return x diff --git a/mambular/utils/ple_encoding.py b/mambular/utils/ple_encoding.py new file mode 100644 index 0000000..972b36b --- /dev/null +++ b/mambular/utils/ple_encoding.py @@ -0,0 +1,156 @@ +import numpy as np +from tqdm import tqdm +import pandas as pd +import bisect +import re +from sklearn.tree import _tree +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +import pandas as pd +import numpy as np +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier + + +def tree_to_code(tree, feature_names): + """ + Convert a scikit-learn decision tree into a list of conditions. + + Args: + tree (sklearn.tree.DecisionTreeRegressor or sklearn.tree.DecisionTreeClassifier): + The decision tree model to be converted. + feature_names (list of str): The names of the features used in the tree. + Y (array-like): The target values associated with the tree. + + Returns: + list of str: A list of conditions representing the decision tree paths. + + Example: + # Convert a decision tree into a list of conditions + tree_conditions = tree_to_code(tree_model, feature_names, target_values) + """ + + tree_ = tree.tree_ + feature_name = [ + feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" + for i in tree_.feature + ] + + pathto = dict() + my_list = [] + + global k + k = 0 + + def recurse(node, depth, parent): + global k + indent = " " * depth + + if tree_.feature[node] != _tree.TREE_UNDEFINED: + # name = df_name + "[" + "'" + feature_name[node]+ "'" + "]" + name = feature_name[node] + threshold = tree_.threshold[node] + s = "{} <= {} ".format(name, threshold, node) + if node == 0: + pathto[node] = "(" + s + ")" + else: + pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")" + + recurse(tree_.children_left[node], depth + 1, node) + s = "{} > {}".format(name, threshold) + if node == 0: + pathto[node] = s + else: + pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")" + recurse(tree_.children_right[node], depth + 1, node) + else: + k = k + 1 + my_list.append(pathto[parent]) + # print(k,')',pathto[parent], tree_.value[node]) + + recurse(0, 1, 0) + + return my_list + + +class PLE(BaseEstimator, TransformerMixin): + def __init__( + self, n_bins=20, tree_params={}, task="regression", conditions=None, **kwargs + ): + super(PLE, self).__init__(**kwargs) + + self.task = task + self.tree_params = tree_params + self.n_bins = n_bins + self.conditions = conditions + self.pattern = ( + r"-?\d+\.?\d*[eE]?[+-]?\d*" # This pattern matches integers and floats + ) + + def fit(self, feature, target): + if self.task == "regression": + dt = DecisionTreeRegressor(max_leaf_nodes=self.n_bins) + elif self.task == "classification": + dt = DecisionTreeClassifier(max_leaf_nodes=self.n_bins) + else: + raise ValueError("This task is not supported") + + dt.fit(feature, target) + + self.conditions = tree_to_code(dt, ["feature"]) + return self + + def transform(self, feature): + if feature.shape == (feature.shape[0], 1): + feature = np.squeeze(feature, axis=1) + else: + feature = feature + result_list = [] + for idx, cond in enumerate(self.conditions): + result_list.append(eval(cond) * (idx + 1)) + + encoded_feature = np.expand_dims(np.sum(np.stack(result_list).T, axis=1), 1) + + 
encoded_feature = np.array(encoded_feature - 1, dtype=np.int64) + + # Initialize an empty list to store the extracted numbers + locations = [] + # Iterate through the strings and extract numbers + for string in self.conditions: + matches = re.findall(self.pattern, string) + locations.extend(matches) + + locations = [float(number) for number in locations] + locations = list(set(locations)) + locations = np.sort(locations) + + ple_encoded_feature = np.zeros((len(feature), locations.shape[0] + 1)) + if locations[-1] > np.max(feature): + locations[-1] = np.max(feature) + + for idx in range(len(encoded_feature)): + if feature[idx] >= locations[-1]: + ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx] + ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1 + elif feature[idx] <= locations[0]: + ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx] + + else: + ple_encoded_feature[idx][encoded_feature[idx]] = ( + feature[idx] - locations[(encoded_feature[idx] - 1)[0]] + ) / ( + locations[(encoded_feature[idx])[0]] + - locations[(encoded_feature[idx] - 1)[0]] + ) + + ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1 + + if ple_encoded_feature.shape[1] == 1: + return np.zeros([len(feature), self.n_bins]) + + else: + return np.array(ple_encoded_feature, dtype=np.float32) + + def get_feature_names_out(self, input_features=None): + if input_features is None: + raise ValueError("input_features must be specified") + return input_features diff --git a/mambular/utils/prepro_utils.py b/mambular/utils/prepro_utils.py new file mode 100644 index 0000000..4bb9fa7 --- /dev/null +++ b/mambular/utils/prepro_utils.py @@ -0,0 +1,167 @@ +import pandas as pd +import numpy as np +from sklearn.base import TransformerMixin, BaseEstimator + + +class CustomBinner(TransformerMixin): + def __init__(self, bins): + # bins can be a scalar (number of bins) or array-like (bin edges) + self.bins = bins + + def fit(self, X, y=None): + # Fit doesn't need to do anything as we are directly using provided bins + return self + + def transform(self, X): + if isinstance(self.bins, int): + # Calculate equal width bins based on the range of the data and number of bins + _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True) + else: + # Use predefined bins + bins = self.bins + + # Apply the bins to the data + binned_data = pd.cut( + X.squeeze(), + bins=np.sort(np.unique(bins)), + labels=False, + include_lowest=True, + ) + print(binned_data) + return np.expand_dims(np.array(binned_data), 1) + + +class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin): + """ + This encoder converts categorical features into continuous integer values. Each unique category within a feature + is assigned a unique integer based on its order of appearance in the dataset. This transformation is useful for + models that can only handle continuous data. + + Attributes: + mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers + for a single feature. + + Methods: + fit(X, y=None): Learns the mapping from original categories to integers. + transform(X): Applies the learned mapping to the data. + get_feature_names_out(input_features=None): Returns the input features after transformation. + """ + + def fit(self, X, y=None): + """ + Learns the mapping from original categories to integers for each feature. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit. + y (ignored): Not used, present for API consistency by convention. 
+ + Returns: + self: Returns the instance itself. + """ + # Fit should determine the mapping from original categories to sequential integers starting from 0 + self.mapping_ = [ + {category: i for i, category in enumerate(np.unique(col))} for col in X.T + ] + return self + + def transform(self, X): + """ + Transforms the categories in X to their corresponding integer values based on the learned mapping. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform. + + Returns: + X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values. + """ + # Transform the categories to their mapped integer values + X_transformed = np.array( + [ + [self.mapping_[col].get(value, -1) for col, value in enumerate(row)] + for row in X + ] + ) + return X_transformed + + def get_feature_names_out(self, input_features=None): + """ + Returns the names of the transformed features. + + Parameters: + input_features (list of str): The names of the input features. + + Returns: + input_features (array of shape (n_features,)): The names of the output features after transformation. + """ + if input_features is None: + raise ValueError("input_features must be specified") + return input_features + + +class OneHotFromOrdinal(TransformerMixin, BaseEstimator): + """ + A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful + in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required + for model training. + + Attributes: + max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature, + determining the size of the one-hot encoded array for that feature. + + Methods: + fit(X, y=None): Learns the maximum bin index for each feature. + transform(X): Converts ordinal-encoded features into one-hot format. + get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding. + """ + + def fit(self, X, y=None): + """ + Learns the maximum bin index for each feature from the data. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features. + y (ignored): Not used, present for API consistency by convention. + + Returns: + self: Returns the instance itself. + """ + self.max_bins_ = ( + np.max(X, axis=0).astype(int) + 1 + ) # Find the maximum bin index for each feature + return self + + def transform(self, X): + """ + Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features. + + Returns: + X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features. + """ + # Initialize an empty list to hold the one-hot encoded arrays + one_hot_encoded = [] + for i, max_bins in enumerate(self.max_bins_): + # Convert each feature to one-hot using its max_bins + feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)] + one_hot_encoded.append(feature_one_hot) + # Concatenate the one-hot encoded features horizontally + return np.hstack(one_hot_encoded) + + def get_feature_names_out(self, input_features=None): + """ + Generates feature names for the one-hot encoded features based on the input feature names and the number of bins. 
+ + Parameters: + input_features (list of str): The names of the input features that were ordinal-encoded. + + Returns: + feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features. + """ + feature_names = [] + for i, max_bins in enumerate(self.max_bins_): + feature_names.extend( + [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] + ) + return np.array(feature_names) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index 7861ba0..082bbec 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -1,6 +1,6 @@ import pandas as pd import numpy as np -from sklearn.base import TransformerMixin, BaseEstimator +from .prepro_utils import OneHotFromOrdinal, CustomBinner, ContinuousOrdinalEncoder from sklearn.preprocessing import ( StandardScaler, KBinsDiscretizer, @@ -10,171 +10,7 @@ from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer -from sklearn.exceptions import NotFittedError - - -class CustomBinner(TransformerMixin): - def __init__(self, bins): - # bins can be a scalar (number of bins) or array-like (bin edges) - self.bins = bins - - def fit(self, X, y=None): - # Fit doesn't need to do anything as we are directly using provided bins - return self - - def transform(self, X): - if isinstance(self.bins, int): - # Calculate equal width bins based on the range of the data and number of bins - _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True) - else: - # Use predefined bins - bins = self.bins - - # Apply the bins to the data - binned_data = pd.cut( - X.squeeze(), - bins=np.sort(np.unique(bins)), - labels=False, - include_lowest=True, - ) - print(binned_data) - return np.expand_dims(np.array(binned_data), 1) - - -class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin): - """ - This encoder converts categorical features into continuous integer values. Each unique category within a feature - is assigned a unique integer based on its order of appearance in the dataset. This transformation is useful for - models that can only handle continuous data. - - Attributes: - mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers - for a single feature. - - Methods: - fit(X, y=None): Learns the mapping from original categories to integers. - transform(X): Applies the learned mapping to the data. - get_feature_names_out(input_features=None): Returns the input features after transformation. - """ - - def fit(self, X, y=None): - """ - Learns the mapping from original categories to integers for each feature. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to fit. - y (ignored): Not used, present for API consistency by convention. - - Returns: - self: Returns the instance itself. - """ - # Fit should determine the mapping from original categories to sequential integers starting from 0 - self.mapping_ = [ - {category: i for i, category in enumerate(np.unique(col))} for col in X.T - ] - return self - - def transform(self, X): - """ - Transforms the categories in X to their corresponding integer values based on the learned mapping. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to transform. - - Returns: - X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values. 
- """ - # Transform the categories to their mapped integer values - X_transformed = np.array( - [ - [self.mapping_[col].get(value, -1) for col, value in enumerate(row)] - for row in X - ] - ) - return X_transformed - - def get_feature_names_out(self, input_features=None): - """ - Returns the names of the transformed features. - - Parameters: - input_features (list of str): The names of the input features. - - Returns: - input_features (array of shape (n_features,)): The names of the output features after transformation. - """ - if input_features is None: - raise ValueError("input_features must be specified") - return input_features - - -class OneHotFromOrdinal(TransformerMixin, BaseEstimator): - """ - A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful - in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required - for model training. - - Attributes: - max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature, - determining the size of the one-hot encoded array for that feature. - - Methods: - fit(X, y=None): Learns the maximum bin index for each feature. - transform(X): Converts ordinal-encoded features into one-hot format. - get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding. - """ - - def fit(self, X, y=None): - """ - Learns the maximum bin index for each feature from the data. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features. - y (ignored): Not used, present for API consistency by convention. - - Returns: - self: Returns the instance itself. - """ - self.max_bins_ = ( - np.max(X, axis=0).astype(int) + 1 - ) # Find the maximum bin index for each feature - return self - - def transform(self, X): - """ - Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features. - - Returns: - X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features. - """ - # Initialize an empty list to hold the one-hot encoded arrays - one_hot_encoded = [] - for i, max_bins in enumerate(self.max_bins_): - # Convert each feature to one-hot using its max_bins - feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)] - one_hot_encoded.append(feature_one_hot) - # Concatenate the one-hot encoded features horizontally - return np.hstack(one_hot_encoded) - - def get_feature_names_out(self, input_features=None): - """ - Generates feature names for the one-hot encoded features based on the input feature names and the number of bins. - - Parameters: - input_features (list of str): The names of the input features that were ordinal-encoded. - - Returns: - feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features. - """ - feature_names = [] - for i, max_bins in enumerate(self.max_bins_): - feature_names.extend( - [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] - ) - return np.array(feature_names) +from .ple_encoding import PLE class Preprocessor: @@ -195,10 +31,15 @@ class Preprocessor: use_decision_tree_bins (bool): If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. 
This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy (str): Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task (str): Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning. Attributes: - column_transformer (ColumnTransformer): A sklearn ColumnTransformer instance that holds the configured - preprocessing pipelines for the different feature types. + column_transformer (ColumnTransformer): An instance of sklearn's ColumnTransformer that holds the + configured preprocessing pipelines for different feature types. + fitted (bool): Indicates whether the preprocessor has been fitted to the data. Methods: fit(X, y=None): Fits the preprocessor to the data, identifying feature types and configuring the @@ -215,13 +56,25 @@ def __init__( numerical_preprocessing="binning", use_decision_tree_bins=False, binning_strategy="uniform", + task="regression", ): self.n_bins = n_bins - self.numerical_preprocessing = numerical_preprocessing + self.numerical_preprocessing = numerical_preprocessing.lower() + if self.numerical_preprocessing not in [ + "ple", + "binning", + "one_hot", + "standardization", + "normalization", + ]: + raise ValueError( + "Invalid numerical_preprocessing value. Supported values are 'ple', 'binning', 'one_hot', 'standardization', and 'normalization'." + ) self.use_decision_tree_bins = use_decision_tree_bins self.column_transformer = None self.fitted = False self.binning_strategy = binning_strategy + self.task = task def set_params(self, **params): for key, value in params.items(): @@ -325,6 +178,12 @@ def fit(self, X, y=None): elif self.numerical_preprocessing == "normalization": numeric_transformer_steps.append(("normalizer", MinMaxScaler())) + elif self.numerical_preprocessing == "ple": + numeric_transformer_steps.append(("normalizer", MinMaxScaler())) + numeric_transformer_steps.append( + ("ple", PLE(n_bins=self.n_bins, task=self.task)) + ) + numeric_transformer = Pipeline(numeric_transformer_steps) transformers.append((f"num_{feature}", numeric_transformer, [feature])) @@ -380,53 +239,53 @@ def _get_decision_tree_bins(self, X, y, numerical_features): def transform(self, X): """ - Transforms the dataset using the fitted preprocessing pipelines. This method applies the transformations set up during the fitting process - to the input data and returns a dictionary with the transformed data. + Transforms the input data using the preconfigured column transformer and converts the output into a dictionary + format with keys corresponding to transformed feature names and values as arrays of transformed data. + + This method converts the sparse or dense matrix returned by the column transformer into a more accessible + dictionary format, where each key-value pair represents a feature and its transformed data. Parameters: - X (DataFrame or dict): The input dataset to be transformed. + X (DataFrame): The input data to be transformed. Returns: - dict: A dictionary where keys are the base feature names and values are the transformed features as arrays. + dict: A dictionary where keys are the names of the features (as per the transformations defined in the + column transformer) and the values are numpy arrays of the transformed data. 
""" - if not self.fitted: - raise NotFittedError( - "This Preprocessor instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." - ) - - if isinstance(X, dict): - X = pd.DataFrame(X) - - # Transform X using the column transformer - transformed_X = self.column_transformer.transform( - X - ) # To understand the shape of the transformed data + transformed_X = self.column_transformer.transform(X) - # Initialize the transformed dictionary - transformed_dict = {} + # Now let's convert this into a dictionary of arrays, one per column + transformed_dict = self._split_transformed_output(X, transformed_X) + return transformed_dict - # Retrieve output feature names from the column transformer - output_features = self.column_transformer.get_feature_names_out() + def _split_transformed_output(self, X, transformed_X): + """ + Splits the transformed data array into a dictionary where keys correspond to the original column names or + feature groups and values are the transformed data for those columns. - # Iterate over each output feature name to populate the transformed_dict - for i, col in enumerate(output_features): - # Extract the base feature name (before any transformation) - base_feature = col.split("__")[0] + This helper method is utilized within `transform` to segregate the transformed data based on the + specification in the column transformer, assigning each transformed section to its corresponding feature name. - # If the base feature name already exists in the dictionary, append the new data - if base_feature in transformed_dict: - transformed_dict[base_feature] = np.vstack( - [transformed_dict[base_feature], transformed_X[:, i]] - ) - else: - # Otherwise, create a new entry in the dictionary - transformed_dict[base_feature] = transformed_X[:, i] + Parameters: + X (DataFrame): The original input data, used for determining shapes and transformations. + transformed_X (numpy array): The transformed data as a numpy array, outputted by the column transformer. - # Ensure all arrays in the dictionary are the correct shape - for key in transformed_dict.keys(): - transformed_dict[key] = ( - transformed_dict[key].reshape(-1, transformed_X.shape[0]).T - ) + Returns: + dict: A dictionary mapping each transformation's name to its respective numpy array of transformed data. + The type of each array (int or float) is determined based on the type of transformation applied. + """ + start = 0 + transformed_dict = {} + for ( + name, + transformer, + columns, + ) in self.column_transformer.transformers_: # skip 'remainder' + if transformer != "drop": + end = start + transformer.transform(X[[columns[0]]]).shape[1] + dtype = int if "cat" in name else float + transformed_dict[name] = transformed_X[:, start:end].astype(dtype) + start = end return transformed_dict @@ -447,12 +306,23 @@ def fit_transform(self, X, y=None): def get_feature_info(self): """ - Returns detailed information about the processed features, including the number of bins for binned features - and the dimensionality of encoded features. This method is useful for understanding the transformations applied to each feature. + Retrieves information about how features are encoded within the model's preprocessor. + This method identifies the type of encoding applied to each feature, categorizing them into binned or ordinal + encodings and other types of encodings (e.g., one-hot encoding after discretization). 
+ + This method should only be called after the preprocessor has been fitted, as it relies on the structure and + configuration of the `column_transformer` attribute. + + Raises: + RuntimeError: If the `column_transformer` is not yet fitted, indicating that the preprocessor must be + fitted before invoking this method. Returns: - tuple: A tuple containing two dictionaries, the first with information about binned or ordinal encoded features and - the second with information about other encoded features. + tuple of (dict, dict): + - The first dictionary maps feature names to their respective number of bins or categories if they are + processed using discretization or ordinal encoding. + - The second dictionary includes feature names with other encoding details, such as the dimension of + features after encoding transformations (e.g., one-hot encoding dimensions). """ binned_or_ordinal_info = {} other_encoding_info = {} @@ -507,7 +377,7 @@ def get_feature_info(self): ) other_encoding_info[feature_name] = transformed_feature.shape[1] print( - f"Feature: {feature_name} ({self.numerical_preprocessing}), Encoded feature dimension: {transformed_feature.shape[1]}" + f"Feature: {feature_name} (Other Encoding), Encoded feature dimension: {transformed_feature.shape[1]}" ) print("-" * 50) diff --git a/setup.py b/setup.py index c6ae326..a7ee80f 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,6 @@ def read_requirements(): "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ], - python_requires=">=3.6, <3.11", + python_requires=">=3.6, <=3.12.3", install_requires=read_requirements(), ) diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index bdcc26b..fb43c64 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -41,7 +41,14 @@ def test_fit_transform(self): """Test fitting and transforming the data.""" pp = Preprocessor(numerical_preprocessing="standardization") transformed_data = pp.fit_transform(self.data) - print(transformed_data) + self.assertIsInstance(transformed_data, dict) + self.assertTrue("num_numerical" in transformed_data) + self.assertTrue("cat_categorical" in transformed_data) + + def test_ple(self): + """Test fitting and transforming the data.""" + pp = Preprocessor(numerical_preprocessing="ple", n_bins=20) + transformed_data = pp.fit_transform(self.data) self.assertIsInstance(transformed_data, dict) self.assertTrue("num_numerical" in transformed_data) self.assertTrue("cat_categorical" in transformed_data) From 3c17fe1f0becae3fbccc56de1dd55ddfe755669b Mon Sep 17 00:00:00 2001 From: thielmaf Date: Tue, 28 May 2024 07:29:03 +0000 Subject: [PATCH 02/21] test new preprocessing (ple) --- mambular/utils/mlp_utils.py | 245 ++++++++++++++++++++++++ mambular/utils/ple_encoding.py | 156 +++++++++++++++ mambular/utils/prepro_utils.py | 167 ++++++++++++++++ mambular/utils/preprocessor.py | 335 ++++++++++----------------------- setup.py | 55 +++--- tests/test_preprocessor.py | 9 +- 6 files changed, 703 insertions(+), 264 deletions(-) create mode 100644 mambular/utils/mlp_utils.py create mode 100644 mambular/utils/ple_encoding.py create mode 100644 mambular/utils/prepro_utils.py diff --git a/mambular/utils/mlp_utils.py b/mambular/utils/mlp_utils.py new file mode 100644 index 0000000..78dc209 --- /dev/null +++ b/mambular/utils/mlp_utils.py @@ -0,0 +1,245 @@ +import torch +import torch.nn as nn + + +class Linear_skip_block(nn.Module): + """ + A neural network block that includes a linear layer, an activation function, a dropout layer, and 
optionally a + skip connection and batch normalization. The skip connection is added if the input and output feature sizes are equal. + + Parameters + ---------- + n_input : int + The number of input features. + n_output : int + The number of output features. + dropout_rate : float + The rate of dropout to apply for regularization. + activation_fn : torch.nn.modules.activation, optional + The activation function to use after the linear layer. Default is nn.LeakyReLU(). + use_batch_norm : bool, optional + Whether to apply batch normalization after the activation function. Default is False. + + Attributes + ---------- + fc : torch.nn.Linear + The linear transformation layer. + act : torch.nn.Module + The activation function. + drop : torch.nn.Dropout + The dropout layer. + use_batch_norm : bool + Indicator of whether batch normalization is used. + batch_norm : torch.nn.BatchNorm1d, optional + The batch normalization layer, instantiated if use_batch_norm is True. + use_skip : bool + Indicator of whether a skip connection is used. + """ + + def __init__( + self, + n_input, + n_output, + dropout_rate, + activation_fn=nn.LeakyReLU(), + use_batch_norm=False, + ): + super(Linear_skip_block, self).__init__() + + self.fc = nn.Linear(n_input, n_output) + self.act = activation_fn + self.drop = nn.Dropout(dropout_rate) + self.use_batch_norm = use_batch_norm + self.use_skip = ( + n_input == n_output + ) # Only use skip connection if input and output sizes are equal + + if use_batch_norm: + self.batch_norm = nn.BatchNorm1d(n_output) # Initialize batch normalization + + def forward(self, x): + """ + Defines the forward pass of the Linear_block. + + Parameters + ---------- + x : Tensor + The input tensor to the block. + + Returns + ------- + Tensor + The output tensor after processing through the linear layer, activation function, dropout, + and optional batch normalization. + """ + x0 = x # Save input for possible skip connection + x = self.fc(x) + x = self.act(x) + + if self.use_batch_norm: + x = self.batch_norm(x) # Apply batch normalization after activation + + if self.use_skip: + x = x + x0 # Add skip connection if applicable + + x = self.drop(x) # Apply dropout + return x + + +class Linear_block(nn.Module): + """ + A neural network block that includes a linear layer, an activation function, a dropout layer, and optionally batch normalization. + + Parameters + ---------- + n_input : int + The number of input features. + n_output : int + The number of output features. + dropout_rate : float + The rate of dropout to apply. + activation_fn : torch.nn.modules.activation, optional + The activation function to use after the linear layer. Default is nn.LeakyReLU(). + batch_norm : bool, optional + Whether to include batch normalization after the activation function. Default is False. + + Attributes + ---------- + block : torch.nn.Sequential + A sequential container holding the linear layer, activation function, dropout, and optionally batch normalization. + """ + + def __init__( + self, + n_input, + n_output, + dropout_rate, + activation_fn=nn.LeakyReLU(), + batch_norm=False, + ): + super(Linear_block, self).__init__() + + # Initialize modules + modules = [ + nn.Linear(n_input, n_output), + activation_fn, + nn.Dropout(dropout_rate), + ] + + # Optionally add batch normalization + if batch_norm: + modules.append(nn.BatchNorm1d(n_output)) + + # Create the sequential model + self.block = nn.Sequential(*modules) + + def forward(self, x): + """ + Defines the forward pass of the Linear_block. 
+ + Parameters + ---------- + x : Tensor + The input tensor to the block. + + Returns + ------- + Tensor + The output tensor after processing through the linear layer, activation function, dropout, + and optional batch normalization. + """ + # Pass the input through the block + return self.block(x) + + +class MLP(nn.Module): + """ + A multi-layer perceptron (MLP) for regression tasks, configurable with optional skip connections and batch normalization. + + Parameters + ---------- + n_input_units : int + The number of units in the input layer. + hidden_units_list : list of int + A list specifying the number of units in each hidden layer. + n_output_units : int + The number of units in the output layer. + dropout_rate : float + The dropout rate used across the MLP. + use_skip_layers : bool, optional + Whether to use skip connections in layers where input and output sizes match. Default is False. + activation_fn : torch.nn.modules.activation, optional + The activation function used across the layers. Default is nn.LeakyReLU(). + use_batch_norm : bool, optional + Whether to apply batch normalization in each layer. Default is False. + + Attributes + ---------- + hidden_layers : torch.nn.Sequential + Sequential container of layers comprising the MLP's hidden layers. + linear_final : torch.nn.Linear + The final linear layer of the MLP. + """ + + def __init__( + self, + n_input_units, + hidden_units_list=[64, 32, 32], + n_output_units: int = 1, + dropout_rate: float = 0.1, + use_skip_layers: bool = False, + activation_fn=nn.LeakyReLU(), + use_batch_norm: bool = False, + ): + super(MLP, self).__init__() + self.n_input_units = n_input_units + self.hidden_units_list = hidden_units_list + self.dropout_rate = dropout_rate + self.n_output_units = n_output_units + + layers = [] + input_units = n_input_units + + for n_hidden_units in hidden_units_list: + if use_skip_layers and input_units == n_hidden_units: + layers.append( + Linear_skip_block( + input_units, + n_hidden_units, + dropout_rate, + activation_fn, + use_batch_norm, + ) + ) + else: + layers.append( + Linear_block( + input_units, + n_hidden_units, + dropout_rate, + activation_fn, + use_batch_norm, + ) + ) + input_units = n_hidden_units # Update input_units for the next layer + + self.hidden_layers = nn.Sequential(*layers) + self.linear_final = nn.Linear(input_units, n_output_units) # Final layer + + def forward(self, x): + """ + Defines the forward pass of the MLP. + + Parameters + ---------- + x : Tensor + The input tensor to the MLP. + + Returns + ------- + Tensor + The output predictions of the model for regression tasks. + """ + x = self.hidden_layers(x) + x = self.linear_final(x) + return x diff --git a/mambular/utils/ple_encoding.py b/mambular/utils/ple_encoding.py new file mode 100644 index 0000000..972b36b --- /dev/null +++ b/mambular/utils/ple_encoding.py @@ -0,0 +1,156 @@ +import numpy as np +from tqdm import tqdm +import pandas as pd +import bisect +import re +from sklearn.tree import _tree +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +import pandas as pd +import numpy as np +from sklearn.base import TransformerMixin, BaseEstimator +from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier + + +def tree_to_code(tree, feature_names): + """ + Convert a scikit-learn decision tree into a list of conditions. + + Args: + tree (sklearn.tree.DecisionTreeRegressor or sklearn.tree.DecisionTreeClassifier): + The decision tree model to be converted. 
+ feature_names (list of str): The names of the features used in the tree. + Y (array-like): The target values associated with the tree. + + Returns: + list of str: A list of conditions representing the decision tree paths. + + Example: + # Convert a decision tree into a list of conditions + tree_conditions = tree_to_code(tree_model, feature_names, target_values) + """ + + tree_ = tree.tree_ + feature_name = [ + feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!" + for i in tree_.feature + ] + + pathto = dict() + my_list = [] + + global k + k = 0 + + def recurse(node, depth, parent): + global k + indent = " " * depth + + if tree_.feature[node] != _tree.TREE_UNDEFINED: + # name = df_name + "[" + "'" + feature_name[node]+ "'" + "]" + name = feature_name[node] + threshold = tree_.threshold[node] + s = "{} <= {} ".format(name, threshold, node) + if node == 0: + pathto[node] = "(" + s + ")" + else: + pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")" + + recurse(tree_.children_left[node], depth + 1, node) + s = "{} > {}".format(name, threshold) + if node == 0: + pathto[node] = s + else: + pathto[node] = "(" + pathto[parent] + ")" + " & " + "(" + s + ")" + recurse(tree_.children_right[node], depth + 1, node) + else: + k = k + 1 + my_list.append(pathto[parent]) + # print(k,')',pathto[parent], tree_.value[node]) + + recurse(0, 1, 0) + + return my_list + + +class PLE(BaseEstimator, TransformerMixin): + def __init__( + self, n_bins=20, tree_params={}, task="regression", conditions=None, **kwargs + ): + super(PLE, self).__init__(**kwargs) + + self.task = task + self.tree_params = tree_params + self.n_bins = n_bins + self.conditions = conditions + self.pattern = ( + r"-?\d+\.?\d*[eE]?[+-]?\d*" # This pattern matches integers and floats + ) + + def fit(self, feature, target): + if self.task == "regression": + dt = DecisionTreeRegressor(max_leaf_nodes=self.n_bins) + elif self.task == "classification": + dt = DecisionTreeClassifier(max_leaf_nodes=self.n_bins) + else: + raise ValueError("This task is not supported") + + dt.fit(feature, target) + + self.conditions = tree_to_code(dt, ["feature"]) + return self + + def transform(self, feature): + if feature.shape == (feature.shape[0], 1): + feature = np.squeeze(feature, axis=1) + else: + feature = feature + result_list = [] + for idx, cond in enumerate(self.conditions): + result_list.append(eval(cond) * (idx + 1)) + + encoded_feature = np.expand_dims(np.sum(np.stack(result_list).T, axis=1), 1) + + encoded_feature = np.array(encoded_feature - 1, dtype=np.int64) + + # Initialize an empty list to store the extracted numbers + locations = [] + # Iterate through the strings and extract numbers + for string in self.conditions: + matches = re.findall(self.pattern, string) + locations.extend(matches) + + locations = [float(number) for number in locations] + locations = list(set(locations)) + locations = np.sort(locations) + + ple_encoded_feature = np.zeros((len(feature), locations.shape[0] + 1)) + if locations[-1] > np.max(feature): + locations[-1] = np.max(feature) + + for idx in range(len(encoded_feature)): + if feature[idx] >= locations[-1]: + ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx] + ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1 + elif feature[idx] <= locations[0]: + ple_encoded_feature[idx][encoded_feature[idx]] = feature[idx] + + else: + ple_encoded_feature[idx][encoded_feature[idx]] = ( + feature[idx] - locations[(encoded_feature[idx] - 1)[0]] + ) / ( + locations[(encoded_feature[idx])[0]] + - 
locations[(encoded_feature[idx] - 1)[0]] + ) + + ple_encoded_feature[idx, : encoded_feature[idx][0]] = 1 + + if ple_encoded_feature.shape[1] == 1: + return np.zeros([len(feature), self.n_bins]) + + else: + return np.array(ple_encoded_feature, dtype=np.float32) + + def get_feature_names_out(self, input_features=None): + if input_features is None: + raise ValueError("input_features must be specified") + return input_features diff --git a/mambular/utils/prepro_utils.py b/mambular/utils/prepro_utils.py new file mode 100644 index 0000000..4bb9fa7 --- /dev/null +++ b/mambular/utils/prepro_utils.py @@ -0,0 +1,167 @@ +import pandas as pd +import numpy as np +from sklearn.base import TransformerMixin, BaseEstimator + + +class CustomBinner(TransformerMixin): + def __init__(self, bins): + # bins can be a scalar (number of bins) or array-like (bin edges) + self.bins = bins + + def fit(self, X, y=None): + # Fit doesn't need to do anything as we are directly using provided bins + return self + + def transform(self, X): + if isinstance(self.bins, int): + # Calculate equal width bins based on the range of the data and number of bins + _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True) + else: + # Use predefined bins + bins = self.bins + + # Apply the bins to the data + binned_data = pd.cut( + X.squeeze(), + bins=np.sort(np.unique(bins)), + labels=False, + include_lowest=True, + ) + print(binned_data) + return np.expand_dims(np.array(binned_data), 1) + + +class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin): + """ + This encoder converts categorical features into continuous integer values. Each unique category within a feature + is assigned a unique integer based on its order of appearance in the dataset. This transformation is useful for + models that can only handle continuous data. + + Attributes: + mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers + for a single feature. + + Methods: + fit(X, y=None): Learns the mapping from original categories to integers. + transform(X): Applies the learned mapping to the data. + get_feature_names_out(input_features=None): Returns the input features after transformation. + """ + + def fit(self, X, y=None): + """ + Learns the mapping from original categories to integers for each feature. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit. + y (ignored): Not used, present for API consistency by convention. + + Returns: + self: Returns the instance itself. + """ + # Fit should determine the mapping from original categories to sequential integers starting from 0 + self.mapping_ = [ + {category: i for i, category in enumerate(np.unique(col))} for col in X.T + ] + return self + + def transform(self, X): + """ + Transforms the categories in X to their corresponding integer values based on the learned mapping. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform. + + Returns: + X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values. + """ + # Transform the categories to their mapped integer values + X_transformed = np.array( + [ + [self.mapping_[col].get(value, -1) for col, value in enumerate(row)] + for row in X + ] + ) + return X_transformed + + def get_feature_names_out(self, input_features=None): + """ + Returns the names of the transformed features. + + Parameters: + input_features (list of str): The names of the input features. 
+ + Returns: + input_features (array of shape (n_features,)): The names of the output features after transformation. + """ + if input_features is None: + raise ValueError("input_features must be specified") + return input_features + + +class OneHotFromOrdinal(TransformerMixin, BaseEstimator): + """ + A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful + in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required + for model training. + + Attributes: + max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature, + determining the size of the one-hot encoded array for that feature. + + Methods: + fit(X, y=None): Learns the maximum bin index for each feature. + transform(X): Converts ordinal-encoded features into one-hot format. + get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding. + """ + + def fit(self, X, y=None): + """ + Learns the maximum bin index for each feature from the data. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features. + y (ignored): Not used, present for API consistency by convention. + + Returns: + self: Returns the instance itself. + """ + self.max_bins_ = ( + np.max(X, axis=0).astype(int) + 1 + ) # Find the maximum bin index for each feature + return self + + def transform(self, X): + """ + Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting. + + Parameters: + X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features. + + Returns: + X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features. + """ + # Initialize an empty list to hold the one-hot encoded arrays + one_hot_encoded = [] + for i, max_bins in enumerate(self.max_bins_): + # Convert each feature to one-hot using its max_bins + feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)] + one_hot_encoded.append(feature_one_hot) + # Concatenate the one-hot encoded features horizontally + return np.hstack(one_hot_encoded) + + def get_feature_names_out(self, input_features=None): + """ + Generates feature names for the one-hot encoded features based on the input feature names and the number of bins. + + Parameters: + input_features (list of str): The names of the input features that were ordinal-encoded. + + Returns: + feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features. 
+ """ + feature_names = [] + for i, max_bins in enumerate(self.max_bins_): + feature_names.extend( + [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] + ) + return np.array(feature_names) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index 2056c69..b754951 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -1,180 +1,17 @@ import numpy as np import pandas as pd -from sklearn.base import BaseEstimator, TransformerMixin +import numpy as np +from .prepro_utils import OneHotFromOrdinal, CustomBinner, ContinuousOrdinalEncoder +from sklearn.preprocessing import ( + StandardScaler, + KBinsDiscretizer, + MinMaxScaler, +) +from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier from sklearn.compose import ColumnTransformer -from sklearn.exceptions import NotFittedError -from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline -from sklearn.preprocessing import (KBinsDiscretizer, MinMaxScaler, - StandardScaler) -from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor - -__all__ = ['Preprocessor'] - - -class CustomBinner(TransformerMixin): - def __init__(self, bins): - # bins can be a scalar (number of bins) or array-like (bin edges) - self.bins = bins - - def fit(self, X, y=None): - # Fit doesn't need to do anything as we are directly using provided bins - return self - - def transform(self, X): - if isinstance(self.bins, int): - # Calculate equal width bins based on the range of the data and number of bins - _, bins = pd.cut(X.squeeze(), bins=self.bins, retbins=True) - else: - # Use predefined bins - bins = self.bins - - # Apply the bins to the data - binned_data = pd.cut( - X.squeeze(), - bins=np.sort(np.unique(bins)), - labels=False, - include_lowest=True, - ) - print(binned_data) - return np.expand_dims(np.array(binned_data), 1) - - -class ContinuousOrdinalEncoder(BaseEstimator, TransformerMixin): - """ - This encoder converts categorical features into continuous integer values. Each unique category within a feature - is assigned a unique integer based on its order of appearance in the dataset. This transformation is useful for - models that can only handle continuous data. - - Attributes: - mapping_ (list of dicts): A list where each element is a dictionary mapping original categories to integers - for a single feature. - - Methods: - fit(X, y=None): Learns the mapping from original categories to integers. - transform(X): Applies the learned mapping to the data. - get_feature_names_out(input_features=None): Returns the input features after transformation. - """ - - def fit(self, X, y=None): - """ - Learns the mapping from original categories to integers for each feature. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to fit. - y (ignored): Not used, present for API consistency by convention. - - Returns: - self: Returns the instance itself. - """ - # Fit should determine the mapping from original categories to sequential integers starting from 0 - self.mapping_ = [ - {category: i for i, category in enumerate(np.unique(col))} for col in X.T - ] - return self - - def transform(self, X): - """ - Transforms the categories in X to their corresponding integer values based on the learned mapping. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to transform. - - Returns: - X_transformed (ndarray of shape (n_samples, n_features)): The transformed data with integer values. 
- """ - # Transform the categories to their mapped integer values - X_transformed = np.array( - [ - [self.mapping_[col].get(value, -1) - for col, value in enumerate(row)] - for row in X - ] - ) - return X_transformed - - def get_feature_names_out(self, input_features=None): - """ - Returns the names of the transformed features. - - Parameters: - input_features (list of str): The names of the input features. - - Returns: - input_features (array of shape (n_features,)): The names of the output features after transformation. - """ - if input_features is None: - raise ValueError("input_features must be specified") - return input_features - - -class OneHotFromOrdinal(TransformerMixin, BaseEstimator): - """ - A transformer that takes ordinal-encoded features and converts them into one-hot encoded format. This is useful - in scenarios where features have been pre-encoded with ordinal encoding and a one-hot representation is required - for model training. - - Attributes: - max_bins_ (ndarray of shape (n_features,)): An array containing the maximum bin index for each feature, - determining the size of the one-hot encoded array for that feature. - - Methods: - fit(X, y=None): Learns the maximum bin index for each feature. - transform(X): Converts ordinal-encoded features into one-hot format. - get_feature_names_out(input_features=None): Returns the feature names after one-hot encoding. - """ - - def fit(self, X, y=None): - """ - Learns the maximum bin index for each feature from the data. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to fit, containing ordinal-encoded features. - y (ignored): Not used, present for API consistency by convention. - - Returns: - self: Returns the instance itself. - """ - self.max_bins_ = ( - np.max(X, axis=0).astype(int) + 1 - ) # Find the maximum bin index for each feature - return self - - def transform(self, X): - """ - Transforms ordinal-encoded features into one-hot encoded format based on the `max_bins_` learned during fitting. - - Parameters: - X (array-like of shape (n_samples, n_features)): The input data to transform, containing ordinal-encoded features. - - Returns: - X_one_hot (ndarray of shape (n_samples, n_output_features)): The one-hot encoded features. - """ - # Initialize an empty list to hold the one-hot encoded arrays - one_hot_encoded = [] - for i, max_bins in enumerate(self.max_bins_): - # Convert each feature to one-hot using its max_bins - feature_one_hot = np.eye(max_bins)[X[:, i].astype(int)] - one_hot_encoded.append(feature_one_hot) - # Concatenate the one-hot encoded features horizontally - return np.hstack(one_hot_encoded) - - def get_feature_names_out(self, input_features=None): - """ - Generates feature names for the one-hot encoded features based on the input feature names and the number of bins. - - Parameters: - input_features (list of str): The names of the input features that were ordinal-encoded. - - Returns: - feature_names (array of shape (n_output_features,)): The names of the one-hot encoded features. - """ - feature_names = [] - for i, max_bins in enumerate(self.max_bins_): - feature_names.extend( - [f"{input_features[i]}_bin_{j}" for j in range(int(max_bins))] - ) - return np.array(feature_names) +from sklearn.impute import SimpleImputer +from .ple_encoding import PLE class Preprocessor: @@ -196,11 +33,15 @@ class Preprocessor: use_decision_tree_bins (bool): If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. 
This parameter is relevant only if `numerical_preprocessing` is set to 'binning' or 'one_hot'. + binning_strategy (str): Defines the strategy for binning numerical features. Options include 'uniform', + 'quantile', or other sklearn-compatible strategies. + task (str): Indicates the type of machine learning task ('regression' or 'classification'). This can + influence certain preprocessing behaviors, especially when using decision tree-based binning. - Attributes - ---------- - column_transformer (ColumnTransformer): A sklearn ColumnTransformer instance that holds the configured - preprocessing pipelines for the different feature types. + Attributes: + column_transformer (ColumnTransformer): An instance of sklearn's ColumnTransformer that holds the + configured preprocessing pipelines for different feature types. + fitted (bool): Indicates whether the preprocessor has been fitted to the data. """ @@ -210,13 +51,25 @@ def __init__( numerical_preprocessing="binning", use_decision_tree_bins=False, binning_strategy="uniform", + task="regression", ): self.n_bins = n_bins - self.numerical_preprocessing = numerical_preprocessing + self.numerical_preprocessing = numerical_preprocessing.lower() + if self.numerical_preprocessing not in [ + "ple", + "binning", + "one_hot", + "standardization", + "normalization", + ]: + raise ValueError( + "Invalid numerical_preprocessing value. Supported values are 'ple', 'binning', 'one_hot', 'standardization', and 'normalization'." + ) self.use_decision_tree_bins = use_decision_tree_bins self.column_transformer = None self.fitted = False self.binning_strategy = binning_strategy + self.task = task def set_params(self, **params): for key, value in params.items(): @@ -244,8 +97,7 @@ def _detect_column_types(self, X): num_unique_values = X[col].nunique() total_samples = len(X[col]) if X[col].dtype.kind not in "iufc" or ( - X[col].dtype.kind == "i" and ( - num_unique_values / total_samples) < 0.05 + X[col].dtype.kind == "i" and (num_unique_values / total_samples) < 0.05 ): categorical_features.append(col) else: @@ -278,8 +130,7 @@ def fit(self, X, y=None): if self.numerical_preprocessing in ["binning", "one_hot"]: bins = ( - self._get_decision_tree_bins( - X[[feature]], y, [feature]) + self._get_decision_tree_bins(X[[feature]], y, [feature]) if self.use_decision_tree_bins else self.n_bins ) @@ -294,8 +145,7 @@ def fit(self, X, y=None): else len(bins) - 1, encode="ordinal", strategy=self.binning_strategy, - subsample=200_000 if len( - X) > 200_000 else None, + subsample=200_000 if len(X) > 200_000 else None, ), ), ] @@ -318,17 +168,20 @@ def fit(self, X, y=None): ) elif self.numerical_preprocessing == "standardization": - numeric_transformer_steps.append( - ("scaler", StandardScaler())) + numeric_transformer_steps.append(("scaler", StandardScaler())) elif self.numerical_preprocessing == "normalization": + numeric_transformer_steps.append(("normalizer", MinMaxScaler())) + + elif self.numerical_preprocessing == "ple": + numeric_transformer_steps.append(("normalizer", MinMaxScaler())) numeric_transformer_steps.append( - ("normalizer", MinMaxScaler())) + ("ple", PLE(n_bins=self.n_bins, task=self.task)) + ) numeric_transformer = Pipeline(numeric_transformer_steps) - transformers.append( - (f"num_{feature}", numeric_transformer, [feature])) + transformers.append((f"num_{feature}", numeric_transformer, [feature])) if categorical_features: for feature in categorical_features: @@ -375,60 +228,59 @@ def _get_decision_tree_bins(self, X, y, numerical_features): bin_edges = 
np.sort(np.unique(thresholds)) bins.append( - np.concatenate( - ([X[feature].min()], bin_edges, [X[feature].max()])) + np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()])) ) return bins def transform(self, X): """ - Transforms the dataset using the fitted preprocessing pipelines. This method applies the transformations set up during the fitting process - to the input data and returns a dictionary with the transformed data. + Transforms the input data using the preconfigured column transformer and converts the output into a dictionary + format with keys corresponding to transformed feature names and values as arrays of transformed data. + + This method converts the sparse or dense matrix returned by the column transformer into a more accessible + dictionary format, where each key-value pair represents a feature and its transformed data. Parameters: - X (DataFrame or dict): The input dataset to be transformed. + X (DataFrame): The input data to be transformed. Returns: - dict: A dictionary where keys are the base feature names and values are the transformed features as arrays. + dict: A dictionary where keys are the names of the features (as per the transformations defined in the + column transformer) and the values are numpy arrays of the transformed data. """ - if not self.fitted: - raise NotFittedError( - "This Preprocessor instance is not fitted yet. Call 'fit' with appropriate arguments before using this method." - ) + transformed_X = self.column_transformer.transform(X) - if isinstance(X, dict): - X = pd.DataFrame(X) - - # Transform X using the column transformer - transformed_X = self.column_transformer.transform( - X - ) # To understand the shape of the transformed data - - # Initialize the transformed dictionary - transformed_dict = {} + # Now let's convert this into a dictionary of arrays, one per column + transformed_dict = self._split_transformed_output(X, transformed_X) + return transformed_dict - # Retrieve output feature names from the column transformer - output_features = self.column_transformer.get_feature_names_out() + def _split_transformed_output(self, X, transformed_X): + """ + Splits the transformed data array into a dictionary where keys correspond to the original column names or + feature groups and values are the transformed data for those columns. - # Iterate over each output feature name to populate the transformed_dict - for i, col in enumerate(output_features): - # Extract the base feature name (before any transformation) - base_feature = col.split("__")[0] + This helper method is utilized within `transform` to segregate the transformed data based on the + specification in the column transformer, assigning each transformed section to its corresponding feature name. - # If the base feature name already exists in the dictionary, append the new data - if base_feature in transformed_dict: - transformed_dict[base_feature] = np.vstack( - [transformed_dict[base_feature], transformed_X[:, i]] - ) - else: - # Otherwise, create a new entry in the dictionary - transformed_dict[base_feature] = transformed_X[:, i] + Parameters: + X (DataFrame): The original input data, used for determining shapes and transformations. + transformed_X (numpy array): The transformed data as a numpy array, outputted by the column transformer. 
- # Ensure all arrays in the dictionary are the correct shape - for key in transformed_dict.keys(): - transformed_dict[key] = ( - transformed_dict[key].reshape(-1, transformed_X.shape[0]).T - ) + Returns: + dict: A dictionary mapping each transformation's name to its respective numpy array of transformed data. + The type of each array (int or float) is determined based on the type of transformation applied. + """ + start = 0 + transformed_dict = {} + for ( + name, + transformer, + columns, + ) in self.column_transformer.transformers_: # skip 'remainder' + if transformer != "drop": + end = start + transformer.transform(X[[columns[0]]]).shape[1] + dtype = int if "cat" in name else float + transformed_dict[name] = transformed_X[:, start:end].astype(dtype) + start = end return transformed_dict @@ -449,12 +301,23 @@ def fit_transform(self, X, y=None): def get_feature_info(self): """ - Returns detailed information about the processed features, including the number of bins for binned features - and the dimensionality of encoded features. This method is useful for understanding the transformations applied to each feature. + Retrieves information about how features are encoded within the model's preprocessor. + This method identifies the type of encoding applied to each feature, categorizing them into binned or ordinal + encodings and other types of encodings (e.g., one-hot encoding after discretization). + + This method should only be called after the preprocessor has been fitted, as it relies on the structure and + configuration of the `column_transformer` attribute. + + Raises: + RuntimeError: If the `column_transformer` is not yet fitted, indicating that the preprocessor must be + fitted before invoking this method. Returns: - tuple: A tuple containing two dictionaries, the first with information about binned or ordinal encoded features and - the second with information about other encoded features. + tuple of (dict, dict): + - The first dictionary maps feature names to their respective number of bins or categories if they are + processed using discretization or ordinal encoding. + - The second dictionary includes feature names with other encoding details, such as the dimension of + features after encoding transformations (e.g., one-hot encoding dimensions). 
""" binned_or_ordinal_info = {} other_encoding_info = {} @@ -473,8 +336,7 @@ def get_feature_info(self): # Handle features processed with discretization if "discretizer" in steps: step = transformer_pipeline.named_steps["discretizer"] - n_bins = step.n_bins_[0] if hasattr( - step, "n_bins_") else None + n_bins = step.n_bins_[0] if hasattr(step, "n_bins_") else None # Check if discretization is followed by one-hot encoding if "onehot_from_ordinal" in steps: @@ -495,8 +357,7 @@ def get_feature_info(self): # Handle features processed with continuous ordinal encoding elif "continuous_ordinal" in steps: step = transformer_pipeline.named_steps["continuous_ordinal"] - n_categories = len( - step.mapping_[columns.index(feature_name)]) + n_categories = len(step.mapping_[columns.index(feature_name)]) binned_or_ordinal_info[feature_name] = n_categories print( f"Categorical Feature (Ordinal Encoded): {feature_name}, Number of unique categories: {n_categories}" @@ -511,7 +372,7 @@ def get_feature_info(self): ) other_encoding_info[feature_name] = transformed_feature.shape[1] print( - f"Feature: {feature_name} ({self.numerical_preprocessing}), Encoded feature dimension: {transformed_feature.shape[1]}" + f"Feature: {feature_name} (Other Encoding), Encoded feature dimension: {transformed_feature.shape[1]}" ) print("-" * 50) diff --git a/setup.py b/setup.py index ee0c16d..5ce6018 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ from setuptools import find_packages, setup # Package meta-data. -NAME = 'mambular' +NAME = "mambular" DESCRIPTION = "A python package for tabular deep learning with mamba blocks." HOMEPAGE = "https://github.com/basf/mamba-tabular" DOCS = "https://mambular.readthedocs.io/en/latest/index.html" @@ -16,7 +16,7 @@ # Load the package's verison file and its content. 
ROOT_DIR = Path(__file__).resolve().parent -PACKAGE_DIR = ROOT_DIR / 'mambular' +PACKAGE_DIR = ROOT_DIR / "mambular" with open(PACKAGE_DIR / "__version__.py") as f: VERSION = f.readlines()[-1].split()[-1].strip("\"'") @@ -24,33 +24,36 @@ # ger install_reqs from requirements file, used for setup function later with open(os.path.join(ROOT_DIR, "requirements.txt")) as f: # next(f) - install_reqs = [line.rstrip() for line in f.readlines() - if not line.startswith("#") and not line.startswith("git+")] + install_reqs = [ + line.rstrip() + for line in f.readlines() + if not line.startswith("#") and not line.startswith("git+") + ] # get long description from readme file with open(os.path.join(ROOT_DIR, "README.md")) as f: LONG_DESCRIPTION = f.read() -setup(name=NAME, - version=VERSION, - description=DESCRIPTION, - long_description=LONG_DESCRIPTION, - long_description_content_type="text/markdown", - author=AUTHOR, - author_email=EMAIL, - python_requires=REQUIRES_PYTHON, - install_requires=install_reqs, - # extras_require=extras_reqs, - license="Copyright (c) 2024 BASF SE", # adapt based on your needs - packages=find_packages(), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - ], - include_package_data=True, - project_urls={'Homepage:': HOMEPAGE, - 'Documentation': DOCS}, - url=HOMEPAGE - ) +setup( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, + long_description_content_type="text/markdown", + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + install_requires=install_reqs, + # extras_require=extras_reqs, + license="Copyright (c) 2024 BASF SE", # adapt based on your needs + packages=find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + include_package_data=True, + project_urls={"Homepage:": HOMEPAGE, "Documentation": DOCS}, + url=HOMEPAGE, +) diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index bdcc26b..fb43c64 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -41,7 +41,14 @@ def test_fit_transform(self): """Test fitting and transforming the data.""" pp = Preprocessor(numerical_preprocessing="standardization") transformed_data = pp.fit_transform(self.data) - print(transformed_data) + self.assertIsInstance(transformed_data, dict) + self.assertTrue("num_numerical" in transformed_data) + self.assertTrue("cat_categorical" in transformed_data) + + def test_ple(self): + """Test fitting and transforming the data.""" + pp = Preprocessor(numerical_preprocessing="ple", n_bins=20) + transformed_data = pp.fit_transform(self.data) self.assertIsInstance(transformed_data, dict) self.assertTrue("num_numerical" in transformed_data) self.assertTrue("cat_categorical" in transformed_data) From 4e530a1d0e935c385fb46b3f08486c3474bc5285 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 07:32:41 +0000 Subject: [PATCH 03/21] include ple encodings --- mambular/utils/preprocessor.py | 39 +--------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index fef0dc2..767e4f1 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -58,7 +58,6 @@ def __init__( use_decision_tree_bins=False, binning_strategy="uniform", task="regression", - task="regression", ): self.n_bins = 
n_bins self.numerical_preprocessing = numerical_preprocessing.lower() @@ -284,32 +283,6 @@ def transform(self, X): transformed_dict = self._split_transformed_output(X, transformed_X) return transformed_dict - def _split_transformed_output(self, X, transformed_X): - """ - Splits the transformed data array into a dictionary where keys correspond to the original column names or - feature groups and values are the transformed data for those columns. - - This helper method is utilized within `transform` to segregate the transformed data based on the - specification in the column transformer, assigning each transformed section to its corresponding feature name. - - Parameters: - X (DataFrame): The original input data, used for determining shapes and transformations. - transformed_X (numpy array): The transformed data as a numpy array, outputted by the column transformer. - - Returns: - dict: A dictionary mapping each transformation's name to its respective numpy array of transformed data. - The type of each array (int or float) is determined based on the type of transformation applied. - """ - start = 0 - dict: A dictionary where keys are the names of the features (as per the transformations defined in the - column transformer) and the values are numpy arrays of the transformed data. - """ - transformed_X = self.column_transformer.transform(X) - - # Now let's convert this into a dictionary of arrays, one per column - transformed_dict = self._split_transformed_output(X, transformed_X) - return transformed_dict - def _split_transformed_output(self, X, transformed_X): """ Splits the transformed data array into a dictionary where keys correspond to the original column names or @@ -332,17 +305,7 @@ def _split_transformed_output(self, X, transformed_X): name, transformer, columns, - ) in self.column_transformer.transformers_: # skip 'remainder' - if transformer != "drop": - end = start + transformer.transform(X[[columns[0]]]).shape[1] - dtype = int if "cat" in name else float - transformed_dict[name] = transformed_X[:, start:end].astype(dtype) - start = end - for ( - name, - transformer, - columns, - ) in self.column_transformer.transformers_: # skip 'remainder' + ) in self.column_transformer.transformers_: if transformer != "drop": end = start + transformer.transform(X[[columns[0]]]).shape[1] dtype = int if "cat" in name else float From 1098f9450a96a9b180689e9de0166363f255984f Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 11:37:58 +0000 Subject: [PATCH 04/21] restructure regression module --- mambular/base_models/embedding_regressor.py | 40 ++-- mambular/base_models/regressor.py | 194 ++++++++++--------- mambular/models/sklearn_regressor.py | 201 +++++++++++++------- mambular/utils/default_mamba_params.py | 34 ++++ mambular/utils/mamba_arch.py | 191 +++++++++++++++---- mambular/utils/normalization_layers.py | 1 - mambular/utils/preprocessor.py | 4 +- 7 files changed, 441 insertions(+), 224 deletions(-) create mode 100644 mambular/utils/default_mamba_params.py diff --git a/mambular/base_models/embedding_regressor.py b/mambular/base_models/embedding_regressor.py index 2b9904e..d7c3b93 100644 --- a/mambular/base_models/embedding_regressor.py +++ b/mambular/base_models/embedding_regressor.py @@ -4,6 +4,7 @@ from ..utils.config import MambularConfig from ..utils.mamba_arch import Mamba +from ..utils.mlp_utils import MLP class BaseEmbeddingMambularRegressor(pl.LightningModule): @@ -57,6 +58,12 @@ def __init__( lr_factor=0.75, seq_size: int = 20, raw_embeddings=False, + 
head_layer_sizes=[64, 32, 32], + head_dropout: float = 0.3, + head_skip_layers: bool = False, + head_activation="leakyrelu", + head_use_batch_norm: bool = False, + attn_dropout: float = 0.3, ): super().__init__() @@ -97,8 +104,7 @@ def __init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(self.seq_size, - self.config.d_model, bias=False), + nn.Linear(self.seq_size, self.config.d_model, bias=False), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) @@ -128,26 +134,17 @@ def __init__( self.mamba = Mamba(self.config) self.norm_f = self.config.norm(self.config.d_model) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() - ) - - # Dynamically create MLP layers based on config.tabular_units - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension - - # Iterate over the specified units for each layer in the MLP - for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units - - # Add the final linear layer to map to a single output value - mlp_layers.append(nn.Linear(input_dim, 1)) + head_activation = activations.get(head_activation.lower(), nn.Identity()) # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + self.tabular_head = MLP( + self.config.d_model, + hidden_units_list=head_layer_sizes, + dropout_rate=head_dropout, + use_skip_layers=head_skip_layers, + activation_fn=head_activation, + use_batch_norm=head_use_batch_norm, + ) self.pooling_method = self.config.pooling_method self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) @@ -176,8 +173,7 @@ def forward(self, cat_features, num_features): The output predictions of the model for regression tasks. """ batch_size = ( - cat_features[0].size(0) if cat_features != [ - ] else num_features[0].size(0) + cat_features[0].size(0) if cat_features != [] else num_features[0].size(0) ) cls_tokens = self.cls_token.expand(batch_size, -1, -1) # Process categorical features if present diff --git a/mambular/base_models/regressor.py b/mambular/base_models/regressor.py index 3b0f721..2ff671c 100644 --- a/mambular/base_models/regressor.py +++ b/mambular/base_models/regressor.py @@ -1,68 +1,38 @@ import lightning as pl import torch import torch.nn as nn - -from ..utils.config import MambularConfig from ..utils.mamba_arch import Mamba +from ..utils.mlp_utils import MLP +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) +from ..utils.default_mamba_params import DefaultConfig class BaseMambularRegressor(pl.LightningModule): - """ - A base regression module for tabular data built on PyTorch Lightning. It incorporates embeddings - for categorical and numerical features with a configurable architecture provided by MambularConfig. - This module is designed for regression tasks. - - Parameters - ---------- - config : MambularConfig - An instance of MambularConfig containing configuration parameters for the model architecture. - cat_feature_info : dict, optional - A dictionary mapping the names of categorical features to their number of unique categories. Defaults to None. - num_feature_info : dict, optional - A dictionary mapping the names of numerical features to their number of dimensions after embedding. Defaults to None. 
- lr : float, optional - The initial learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - The number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay (L2 penalty) coefficient. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced. Defaults to 0.75. - - - Attributes - ---------- - mamba : Mamba - The core neural network module implementing the Mamba architecture. - norm_f : nn.Module - Normalization layer applied after the Mamba block. - tabular_head : nn.Linear - Final linear layer mapping the features to a single output for regression tasks. - train_mse : torchmetrics.MeanSquaredError - Metric computation module for training Mean Squared Error. - val_mse : torchmetrics.MeanSquaredError - Metric computation module for validation Mean Squared Error. - loss_fct : torch.nn.MSELoss - The loss function for regression tasks. - """ - def __init__( self, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultConfig = DefaultConfig(), + **kwargs, ): super().__init__() - self.config = config - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info @@ -75,67 +45,108 @@ def __init__( "selu": nn.SELU(), "gelu": nn.GELU(), "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), "linear": nn.Identity(), + "silu": nn.functional.silu, } - self.embedding_activation = activations.get( - self.config.num_embedding_activation.lower() + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) + + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), ) + + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == 
"RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") + if self.embedding_activation is None: raise ValueError( - f"Unsupported activation function: {self.config.num_embedding_activation}" + f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" ) self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(input_shape, self.config.d_model, bias=False), - # Example using ReLU as the activation function, change as needed + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), self.embedding_activation, ) for feature_name, input_shape in num_feature_info.items() ] ) - # Create embedding layers for categorical features based on cat_feature_info self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() - ) - - # Dynamically create MLP layers based on config.tabular_units - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension - - # Iterate over the specified units for each layer in the MLP - for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units + head_activation = self.hparams.get("head_activation", config.head_activation) - # Add the final linear layer to map to a single output value - mlp_layers.append(nn.Linear(input_dim, 1)) - - # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + ) - self.pooling_method = self.config.pooling_method - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) self.loss_fct = nn.MSELoss() - if self.config.layer_norm_after_embedding: - self.embedding_norm = nn.LayerNorm(self.config.d_model) + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) def forward(self, cat_features, num_features): """ @@ -156,8 
+167,7 @@ def forward(self, cat_features, num_features): """ batch_size = ( - cat_features[0].size(0) if cat_features != [ - ] else num_features[0].size(0) + cat_features[0].size(0) if cat_features != [] else num_features[0].size(0) ) cls_tokens = self.cls_token.expand(batch_size, -1, -1) @@ -168,7 +178,7 @@ def forward(self, cat_features, num_features): ] cat_embeddings = torch.stack(cat_embeddings, dim=1) cat_embeddings = torch.squeeze(cat_embeddings, dim=2) - if self.config.layer_norm_after_embedding: + if self.hparams.get("layer_norm_after_embedding"): cat_embeddings = self.embedding_norm(cat_embeddings) else: cat_embeddings = None @@ -179,7 +189,7 @@ def forward(self, cat_features, num_features): emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) ] num_embeddings = torch.stack(num_embeddings, dim=1) - if self.config.layer_norm_after_embedding: + if self.hparams.get("layer_norm_after_embedding"): num_embeddings = self.embedding_norm(num_embeddings) else: num_embeddings = None @@ -209,7 +219,7 @@ def forward(self, cat_features, num_features): else: raise ValueError(f"Invalid pooling method: {self.pooling_method}") - x = self.norm_f(x) + x = self.norm_f.forward(x) preds = self.tabular_head(x) return preds @@ -281,7 +291,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( diff --git a/mambular/models/sklearn_regressor.py b/mambular/models/sklearn_regressor.py index ad23fd6..c75f5d8 100644 --- a/mambular/models/sklearn_regressor.py +++ b/mambular/models/sklearn_regressor.py @@ -6,30 +6,94 @@ from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split from torch.utils.data import DataLoader +import warnings from ..base_models.regressor import BaseMambularRegressor -from ..utils.config import MambularConfig from ..utils.dataset import MambularDataModule, MambularDataset from ..utils.preprocessor import Preprocessor +from ..utils.default_mamba_params import DefaultConfig class MambularRegressor(BaseEstimator): """ - A regressor implemented using PyTorch Lightning that follows the scikit-learn API conventions. This class is designed - to work with tabular data, offering a straightforward way to specify model configurations and preprocessing steps. It - integrates seamlessly with scikit-learn's tools such as cross-validation and grid search. + A regressor implemented using PyTorch Lightning that follows the scikit-learn API conventions. + This class is designed to work with tabular data, offering a straightforward way to specify + model configurations and preprocessing steps. It integrates seamlessly with scikit-learn's tools + such as cross-validation and grid search. Parameters ---------- - **kwargs : Various - Accepts any number of keyword arguments. Arguments recognized as model configuration options are passed to the - MambularConfig constructor. Remaining arguments are assumed to be preprocessor options and passed to the - Preprocessor constructor. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. 
+ weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. + dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). + head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. + Attributes ---------- - config : MambularConfig + config : DefaultConfig An object storing the configuration settings for the model. preprocessor : Preprocessor An object responsible for preprocessing the input data, such as encoding categorical variables and scaling numerical features. 
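Note on the parameters documented above: the constructor splits its keyword arguments between the model configuration and the preprocessor (see the `config_arg_names` and `preprocessor_arg_names` lists in the following hunk). A minimal sketch of the intended call, assuming the class is importable from `mambular.models.sklearn_regressor` as the file layout in this patch suggests; the specific values are arbitrary illustrations, not recommended settings:

    from mambular.models.sklearn_regressor import MambularRegressor

    # Illustrative values only: architecture kwargs are routed to the model
    # config object, preprocessing kwargs to the Preprocessor instance.
    reg = MambularRegressor(
        d_model=64,                     # model config
        n_layers=4,                     # model config
        dropout=0.05,                   # model config
        numerical_preprocessing="ple",  # preprocessor
        n_bins=50,                      # preprocessor
        task="regression",              # preprocessor (PLE binning uses the task type)
    )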
@@ -39,44 +103,60 @@ class MambularRegressor(BaseEstimator): def __init__(self, **kwargs): # Known config arguments - print("Received kwargs:", kwargs) config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", - "tabular_head_units", - "tabular_head_activation", - "tabular_head_dropout", - "num_emebedding_activation", - "layer_norm_after_embedding", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", + ] + + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", ] - self.config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**self.config_kwargs) - # The rest are assumed to be preprocessor arguments + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultConfig(**self.config_kwargs) + preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "classification": + warnings.warn( + "The task is set to 'classification'. MambularRegressor is designed for regression tasks.", + UserWarning, + ) + def get_params(self, deep=True): """ Get parameters for this estimator. Overrides the BaseEstimator method. @@ -86,13 +166,12 @@ def get_params(self, deep=True): deep : bool, default=True If True, returns the parameters for this estimator and contained sub-objects that are estimators. - Returns ------- params : dict Parameter names mapped to their values. """ - params = self.config_kwargs # Parameters used to initialize MambularConfig + params = self.config_kwargs # Parameters used to initialize DefaultConfig # If deep=True, include parameters from nested components like preprocessor if deep: @@ -114,7 +193,6 @@ def set_params(self, **parameters): **parameters : dict Estimator parameters to be set. - Returns ------- self : object @@ -122,8 +200,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -194,8 +271,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): data_module : MambularDataModule An instance of MambularDataModule containing the training and validation DataLoaders. 
""" - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -215,26 +291,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -242,13 +314,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): num_key = "num_" + key # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.float) @@ -258,8 +328,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): train_dataset = MambularDataset( train_cat_tensors, train_num_tensors, train_labels ) - val_dataset = MambularDataset( - val_cat_tensors, val_num_tensors, val_labels) + val_dataset = MambularDataset(val_cat_tensors, val_num_tensors, val_labels) # Create dataloaders train_dataloader = DataLoader( @@ -320,20 +389,20 @@ def fit( self, X, y, - val_size=0.2, + val_size: float = 0.2, X_val=None, y_val=None, - max_epochs=100, - random_state=101, - batch_size=128, - shuffle=True, - patience=10, - monitor="val_loss", - mode="min", - lr=1e-3, - lr_patience=5, - factor=0.75, - weight_decay=1e-06, + max_epochs: int = 100, + random_state: int = 101, + batch_size: int = 128, + shuffle: bool = True, + patience: int = 15, + monitor: str = "val_loss", + mode: str = "min", + lr: float = 1e-4, + lr_patience: int = 10, + factor: float = 0.1, + weight_decay: float = 1e-06, **trainer_kwargs ): """ @@ -369,7 +438,7 @@ def fit( Learning rate for the optimizer. lr_patience : int, default=10 Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. - factor : float, default=0.75 + factor : float, default=0.1 Factor by which the learning rate will be reduced. weight_decay : float, default=0.025 Weight decay (L2 penalty) coefficient. 
diff --git a/mambular/utils/default_mamba_params.py b/mambular/utils/default_mamba_params.py new file mode 100644 index 0000000..f46e4c4 --- /dev/null +++ b/mambular/utils/default_mamba_params.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass +import torch.nn as nn + + +@dataclass +class DefaultConfig: + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + d_model: int = 64 + n_layers: int = 8 + expand_factor: int = 2 + bias: bool = False + d_conv: int = 16 + conv_bias: bool = True + dropout: float = 0.05 + dt_rank: str = "auto" + d_state: int = 32 + dt_scale: float = 1.0 + dt_init: str = "random" + dt_max: float = 0.1 + dt_min: float = 1e-04 + dt_init_floor: float = 1e-04 + norm: str = "RMSNorm" + activation: callable = nn.SELU() + num_embedding_activation: callable = nn.Identity() + head_layer_sizes: list = (128, 64, 32) + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = (False,) + layer_norm_after_embedding: bool = (False,) + pooling_method: str = "avg" diff --git a/mambular/utils/mamba_arch.py b/mambular/utils/mamba_arch.py index 2e7ca7c..a1eb830 100644 --- a/mambular/utils/mamba_arch.py +++ b/mambular/utils/mamba_arch.py @@ -2,7 +2,14 @@ import torch import torch.nn as nn import torch.nn.functional as F -from .config import MambularConfig +from .normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) ### Heavily inspired and mostly taken from https://github.com/alxndrTL/mamba.py @@ -16,13 +23,48 @@ class Mamba(nn.Module): layers (nn.ModuleList): List of MambaBlocks constituting the model. """ - def __init__(self, config: MambularConfig): + def __init__( + self, + d_model=32, + n_layers=8, + expand_factor=2, + bias=False, + d_conv=8, + conv_bias=True, + dropout=0.01, + dt_rank="auto", + d_state=16, + dt_scale=1.0, + dt_init="random", + dt_max=0.1, + dt_min=1e-03, + dt_init_floor=1e-04, + norm=RMSNorm, + activation=F.silu, + ): super().__init__() - self.config = config - self.layers = nn.ModuleList( - [ResidualBlock(config) for _ in range(config.n_layers)] + [ + ResidualBlock( + d_model, + expand_factor, + bias, + d_conv, + conv_bias, + dropout, + dt_rank, + d_state, + dt_scale, + dt_init, + dt_max, + dt_min, + dt_init_floor, + norm, + activation, + ) + for _ in range(n_layers) + ] ) def forward(self, x): @@ -40,11 +82,67 @@ class ResidualBlock(nn.Module): norm (RMSNorm): Normalization layer. """ - def __init__(self, config: MambularConfig): + def __init__( + self, + d_model=32, + expand_factor=2, + bias=False, + d_conv=16, + conv_bias=True, + dropout=0.01, + dt_rank="auto", + d_state=32, + dt_scale=1.0, + dt_init="random", + dt_max=0.1, + dt_min=1e-03, + dt_init_floor=1e-04, + norm=RMSNorm, + activation=F.silu, + ): super().__init__() - self.layers = MambaBlock(config) - self.norm = config.norm(config.d_model) + VALID_NORMALIZATION_LAYERS = { + "RMSNorm": RMSNorm, + "LayerNorm": LayerNorm, + "LearnableLayerScaling": LearnableLayerScaling, + "BatchNorm": BatchNorm, + "InstanceNorm": InstanceNorm, + "GroupNorm": GroupNorm, + } + + # Check if the provided normalization layer is valid + if isinstance(norm, type) and norm.__name__ not in VALID_NORMALIZATION_LAYERS: + raise ValueError( + f"Invalid normalization layer: {norm.__name__}. 
" + f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" + ) + elif isinstance(norm, str) and norm not in self.VALID_NORMALIZATION_LAYERS: + raise ValueError( + f"Invalid normalization layer: {norm}. " + f"Valid options are: {', '.join(VALID_NORMALIZATION_LAYERS.keys())}" + ) + + if dt_rank == "auto": + dt_rank = math.ceil(d_model / 16) + + self.layers = MambaBlock( + d_model=d_model, + expand_factor=expand_factor, + bias=bias, + d_conv=d_conv, + conv_bias=conv_bias, + dropout=dropout, + dt_rank=dt_rank, + d_state=d_state, + dt_scale=dt_scale, + dt_init=dt_init, + dt_max=dt_max, + dt_min=dt_min, + dt_init_floor=dt_init_floor, + activation=activation, + ) + self.norm = norm(d_model) def forward(self, x): output = self.layers(self.norm(x)) + x @@ -65,53 +163,66 @@ class MambaBlock(nn.Module): out_proj (nn.Linear): Linear projection for output. """ - def __init__(self, config: MambularConfig): + def __init__( + self, + d_model=32, + expand_factor=2, + bias=False, + d_conv=16, + conv_bias=True, + dropout=0.01, + dt_rank="auto", + d_state=32, + dt_scale=1.0, + dt_init="random", + dt_max=0.1, + dt_min=1e-03, + dt_init_floor=1e-04, + activation=F.silu, + ): super().__init__() + self.d_inner = d_model * expand_factor - self.config = config - - self.in_proj = nn.Linear(config.d_model, 2 * config.d_inner, bias=config.bias) + self.in_proj = nn.Linear(d_model, 2 * self.d_inner, bias=bias) self.conv1d = nn.Conv1d( - in_channels=config.d_inner, - out_channels=config.d_inner, - kernel_size=config.d_conv, - bias=config.conv_bias, - groups=config.d_inner, - padding=config.d_conv - 1, + in_channels=self.d_inner, + out_channels=self.d_inner, + kernel_size=d_conv, + bias=conv_bias, + groups=self.d_inner, + padding=d_conv - 1, ) - self.dropout = nn.Dropout(config.dropout) + self.dropout = nn.Dropout(dropout) + self.activation = activation - self.x_proj = nn.Linear( - config.d_inner, config.dt_rank + 2 * config.d_state, bias=False - ) + self.x_proj = nn.Linear(self.d_inner, dt_rank + 2 * d_state, bias=False) - self.dt_proj = nn.Linear(config.dt_rank, config.d_inner, bias=True) + self.dt_proj = nn.Linear(dt_rank, self.d_inner, bias=True) - dt_init_std = config.dt_rank**-0.5 * config.dt_scale - if config.dt_init == "constant": + dt_init_std = dt_rank**-0.5 * dt_scale + if dt_init == "constant": nn.init.constant_(self.dt_proj.weight, dt_init_std) - elif config.dt_init == "random": + elif dt_init == "random": nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std) else: raise NotImplementedError dt = torch.exp( - torch.rand(config.d_inner) - * (math.log(config.dt_max) - math.log(config.dt_min)) - + math.log(config.dt_min) - ).clamp(min=config.dt_init_floor) + torch.rand(self.d_inner) * (math.log(dt_max) - math.log(dt_min)) + + math.log(dt_min) + ).clamp(min=dt_init_floor) inv_dt = dt + torch.log(-torch.expm1(-dt)) with torch.no_grad(): self.dt_proj.bias.copy_(inv_dt) - A = torch.arange(1, config.d_state + 1, dtype=torch.float32).repeat( - config.d_inner, 1 - ) + A = torch.arange(1, d_state + 1, dtype=torch.float32).repeat(self.d_inner, 1) self.A_log = nn.Parameter(torch.log(A)) - self.D = nn.Parameter(torch.ones(config.d_inner)) - self.out_proj = nn.Linear(config.d_inner, config.d_model, bias=config.bias) + self.D = nn.Parameter(torch.ones(self.d_inner)) + self.out_proj = nn.Linear(self.d_inner, d_model, bias=bias) + self.dt_rank = dt_rank + self.d_state = d_state def forward(self, x): _, L, _ = x.shape @@ -123,11 +234,11 @@ def forward(self, x): x = self.conv1d(x)[:, :, :L] x = 
x.transpose(1, 2) - x = F.silu(x) + x = self.activation(x) x = self.dropout(x) y = self.ssm(x) - z = F.silu(z) + z = self.activation(z) z = self.dropout(z) output = y * z @@ -143,7 +254,7 @@ def ssm(self, x): delta, B, C = torch.split( deltaBC, - [self.config.dt_rank, self.config.d_state, self.config.d_state], + [self.dt_rank, self.d_state, self.d_state], dim=-1, ) delta = F.softplus(self.dt_proj(delta)) @@ -160,9 +271,7 @@ def selective_scan_seq(self, x, delta, A, B, C, D): BX = deltaB * (x.unsqueeze(-1)) - h = torch.zeros( - x.size(0), self.config.d_inner, self.config.d_state, device=deltaA.device - ) + h = torch.zeros(x.size(0), self.d_inner, self.d_state, device=deltaA.device) hs = [] for t in range(0, L): diff --git a/mambular/utils/normalization_layers.py b/mambular/utils/normalization_layers.py index 817a2cd..5237177 100644 --- a/mambular/utils/normalization_layers.py +++ b/mambular/utils/normalization_layers.py @@ -15,7 +15,6 @@ class RMSNorm(nn.Module): def __init__(self, d_model: int, eps: float = 1e-5): super().__init__() - self.eps = eps self.weight = nn.Parameter(torch.ones(d_model)) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index 767e4f1..c443ed1 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -53,8 +53,8 @@ class Preprocessor: def __init__( self, - n_bins=200, - numerical_preprocessing="binning", + n_bins=50, + numerical_preprocessing="ple", use_decision_tree_bins=False, binning_strategy="uniform", task="regression", From 19fd5ea3c20337fae4ea1fc10630c60a6d0deab0 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 12:13:36 +0000 Subject: [PATCH 05/21] include configs in single file --- mambular/utils/configs.py | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 mambular/utils/configs.py diff --git a/mambular/utils/configs.py b/mambular/utils/configs.py new file mode 100644 index 0000000..821d23e --- /dev/null +++ b/mambular/utils/configs.py @@ -0,0 +1,57 @@ +from dataclasses import dataclass +import torch.nn as nn + + +@dataclass +class DefaultMambularConfig: + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + d_model: int = 64 + n_layers: int = 8 + expand_factor: int = 2 + bias: bool = False + d_conv: int = 16 + conv_bias: bool = True + dropout: float = 0.05 + dt_rank: str = "auto" + d_state: int = 32 + dt_scale: float = 1.0 + dt_init: str = "random" + dt_max: float = 0.1 + dt_min: float = 1e-04 + dt_init_floor: float = 1e-04 + norm: str = "RMSNorm" + activation: callable = nn.SELU() + num_embedding_activation: callable = nn.Identity() + head_layer_sizes: list = (128, 64, 32) + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = (False,) + layer_norm_after_embedding: bool = (False,) + pooling_method: str = "avg" + + +@dataclass +class DefaultFTTransformerConfig: + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + d_model: int = 64 + n_layers: int = 8 + n_heads: int = 4 + attn_dropout: float = 0.3 + ff_dropout: float = 0.3 + norm: str = "RMSNorm" + activation: callable = nn.SELU() + num_embedding_activation: callable = nn.Identity() + head_layer_sizes: list = (128, 64, 32) + head_dropout: float = 0.5 + head_skip_layers: bool = False + head_activation: callable = nn.SELU() + head_use_batch_norm: bool = (False,) + layer_norm_after_embedding: bool = (False,) + pooling_method: str = 
"avg" From 339cd6eaa5dfa6ed5c41acf4eb60e0c811b69755 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 12:13:46 +0000 Subject: [PATCH 06/21] delete current helper default params --- mambular/utils/default_mamba_params.py | 34 -------------------------- 1 file changed, 34 deletions(-) delete mode 100644 mambular/utils/default_mamba_params.py diff --git a/mambular/utils/default_mamba_params.py b/mambular/utils/default_mamba_params.py deleted file mode 100644 index f46e4c4..0000000 --- a/mambular/utils/default_mamba_params.py +++ /dev/null @@ -1,34 +0,0 @@ -from dataclasses import dataclass -import torch.nn as nn - - -@dataclass -class DefaultConfig: - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - d_model: int = 64 - n_layers: int = 8 - expand_factor: int = 2 - bias: bool = False - d_conv: int = 16 - conv_bias: bool = True - dropout: float = 0.05 - dt_rank: str = "auto" - d_state: int = 32 - dt_scale: float = 1.0 - dt_init: str = "random" - dt_max: float = 0.1 - dt_min: float = 1e-04 - dt_init_floor: float = 1e-04 - norm: str = "RMSNorm" - activation: callable = nn.SELU() - num_embedding_activation: callable = nn.Identity() - head_layer_sizes: list = (128, 64, 32) - head_dropout: float = 0.5 - head_skip_layers: bool = False - head_activation: callable = nn.SELU() - head_use_batch_norm: bool = (False,) - layer_norm_after_embedding: bool = (False,) - pooling_method: str = "avg" From fc4c5de07316aa5fe31cb4307786262682bee3ff Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 12:13:57 +0000 Subject: [PATCH 07/21] delete former activation mapping --- mambular/base_models/regressor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mambular/base_models/regressor.py b/mambular/base_models/regressor.py index 2ff671c..34d5283 100644 --- a/mambular/base_models/regressor.py +++ b/mambular/base_models/regressor.py @@ -11,7 +11,7 @@ InstanceNorm, GroupNorm, ) -from ..utils.default_mamba_params import DefaultConfig +from ..utils.configs import DefaultMambularConfig class BaseMambularRegressor(pl.LightningModule): @@ -19,7 +19,7 @@ def __init__( self, cat_feature_info, num_feature_info, - config: DefaultConfig = DefaultConfig(), + config: DefaultMambularConfig = DefaultMambularConfig(), **kwargs, ): super().__init__() From dee7c0ae716867b499ca1baad062d9c041204007 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 12:14:03 +0000 Subject: [PATCH 08/21] adjust config import --- mambular/models/sklearn_regressor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mambular/models/sklearn_regressor.py b/mambular/models/sklearn_regressor.py index c75f5d8..6a1ca1b 100644 --- a/mambular/models/sklearn_regressor.py +++ b/mambular/models/sklearn_regressor.py @@ -11,7 +11,7 @@ from ..base_models.regressor import BaseMambularRegressor from ..utils.dataset import MambularDataModule, MambularDataset from ..utils.preprocessor import Preprocessor -from ..utils.default_mamba_params import DefaultConfig +from ..utils.configs import DefaultMambularConfig class MambularRegressor(BaseEstimator): @@ -141,7 +141,7 @@ def __init__(self, **kwargs): ] self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} - self.config = DefaultConfig(**self.config_kwargs) + self.config = DefaultMambularConfig(**self.config_kwargs) preprocessor_kwargs = { k: v for k, v in kwargs.items() if k in preprocessor_arg_names From 9e71ddbe671e2f568cecea0535d5f90dfe852821 Mon Sep 17 00:00:00 2001 From: 
thielmaf Date: Wed, 29 May 2024 14:21:29 +0000 Subject: [PATCH 09/21] include documentation --- mambular/base_models/regressor.py | 65 ++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/mambular/base_models/regressor.py b/mambular/base_models/regressor.py index 34d5283..4171042 100644 --- a/mambular/base_models/regressor.py +++ b/mambular/base_models/regressor.py @@ -15,6 +15,56 @@ class BaseMambularRegressor(pl.LightningModule): + """ + A PyTorch Lightning Module for regression tasks utilizing the Mamba architecture and various normalization techniques. + + Parameters + ---------- + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + **kwargs : dict + Additional keyword arguments. + + Attributes + ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. + mamba : Mamba + Mamba architecture component. + norm_f : nn.Module + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + loss_fct : nn.Module + Loss function. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. 
+ """ + def __init__( self, cat_feature_info, @@ -36,19 +86,6 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "linear": nn.Identity(), - "silu": nn.functional.silu, - } - self.embedding_activation = self.hparams.get( "num_embedding_activation", config.num_embedding_activation ) @@ -219,7 +256,7 @@ def forward(self, cat_features, num_features): else: raise ValueError(f"Invalid pooling method: {self.pooling_method}") - x = self.norm_f.forward(x) + x = self.norm_f(x) preds = self.tabular_head(x) return preds From 8d9eebcf823be27f9c97163c2de16af3ee71ffde Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:21:43 +0000 Subject: [PATCH 10/21] adapt to new config + fix prediction for binary --- mambular/base_models/classifier.py | 227 ++++++++++++++++++----------- 1 file changed, 141 insertions(+), 86 deletions(-) diff --git a/mambular/base_models/classifier.py b/mambular/base_models/classifier.py index 3fe299d..8c818a7 100644 --- a/mambular/base_models/classifier.py +++ b/mambular/base_models/classifier.py @@ -2,9 +2,17 @@ import torch import torch.nn as nn import torchmetrics - -from ..utils.config import MambularConfig from ..utils.mamba_arch import Mamba +from ..utils.mlp_utils import MLP +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) +from ..utils.configs import DefaultMambularConfig class BaseMambularClassifier(pl.LightningModule): @@ -17,41 +25,49 @@ class BaseMambularClassifier(pl.LightningModule): Parameters ---------- num_classes : int - The number of classes in the classification task. For binary classification, this should be 2. - config : MambularConfig - An instance of MambularConfig containing configuration parameters for the Mambular model. - cat_feature_info : dict, optional - A dictionary mapping the names of categorical features to their number of unique categories. - This information is used to configure embedding layers for categorical features. Defaults to None. - num_feature_info : dict, optional - A dictionary mapping the names of numerical features to the size of their input dimensions. - This information is used to configure embedding layers for numerical features. Defaults to None. - lr : float, optional - The learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - The number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay (L2 penalty) parameter for the optimizer. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced. Defaults to 0.75. - - - Attributes + number of classes for classification. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + **kwargs : dict + Additional keyword arguments. + + + Attributes ---------- - embedding_activation : nn.Module - The activation function to be applied after the linear transformation of numerical features. 
- num_embeddings : nn.ModuleList - A list of sequential modules, each corresponding to an embedding layer for a numerical feature. - cat_embeddings : nn.ModuleList - A list of embedding layers, each corresponding to a categorical feature. + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. mamba : Mamba - The Mambular model for processing sequences of embeddings. + Mamba architecture component. norm_f : nn.Module - A normalization layer applied after the Mambular model. - tabular_head : nn.Linear - A linear layer for predicting the class labels from the aggregated embedding representation. - pooling_method : str - The method used to aggregate embeddings across features. Supported methods are 'avg', 'max', and 'sum'. + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. loss_fct : nn.Module The loss function used for training the model, configured based on the number of classes. acc : torchmetrics.Accuracy @@ -66,90 +82,125 @@ class BaseMambularClassifier(pl.LightningModule): def __init__( self, num_classes, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultMambularConfig = DefaultMambularConfig(), + **kwargs, ): super().__init__() - self.config = config self.num_classes = 1 if num_classes == 2 else num_classes - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), - "linear": nn.Identity(), - } + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) - self.embedding_activation = activations.get( - self.config.num_embedding_activation.lower() + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + 
n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), ) + + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") + if self.embedding_activation is None: raise ValueError( - f"Unsupported activation function: {self.config.num_embedding_activation}" + f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" ) self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(input_shape, self.config.d_model, bias=False), - nn.BatchNorm1d(self.config.d_model), - # Example using ReLU as the activation function, change as needed + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), self.embedding_activation, ) for feature_name, input_shape in num_feature_info.items() ] ) - # Create embedding layers for categorical features based on cat_feature_info self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) + head_activation = self.hparams.get("head_activation", config.head_activation) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + n_output_units=self.num_classes, ) - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension - - # Iterate over the specified units for each layer in the MLP - 
for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units - # Add the final linear layer to map to a single output value - mlp_layers.append(nn.Linear(input_dim, self.num_classes)) + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) - # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + self.loss_fct = nn.MSELoss() - self.pooling_method = self.config.pooling_method - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) if self.num_classes > 2: self.loss_fct = nn.CrossEntropyLoss() @@ -199,6 +250,8 @@ def forward(self, cat_features, num_features): ] cat_embeddings = torch.stack(cat_embeddings, dim=1) cat_embeddings = torch.squeeze(cat_embeddings, dim=2) + if self.hparams.get("layer_norm_after_embedding"): + cat_embeddings = self.embedding_norm(cat_embeddings) else: cat_embeddings = None @@ -208,6 +261,8 @@ def forward(self, cat_features, num_features): emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) ] num_embeddings = torch.stack(num_embeddings, dim=1) + if self.hparams.get("layer_norm_after_embedding"): + num_embeddings = self.embedding_norm(num_embeddings) else: num_embeddings = None @@ -358,7 +413,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( From b83ddacf30da48f236fc25f272e6fcb08b809957 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:27:19 +0000 Subject: [PATCH 11/21] adjust sklearn wrapper classifier --- mambular/models/sklearn_classifier.py | 200 ++++++++++++++++++-------- 1 file changed, 141 insertions(+), 59 deletions(-) diff --git a/mambular/models/sklearn_classifier.py b/mambular/models/sklearn_classifier.py index dbfce8a..bc6626e 100644 --- a/mambular/models/sklearn_classifier.py +++ b/mambular/models/sklearn_classifier.py @@ -1,5 +1,6 @@ import lightning as pl import numpy as np +import warnings import pandas as pd import torch from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint @@ -9,7 +10,7 @@ from torch.utils.data import DataLoader from ..base_models.classifier import BaseMambularClassifier -from ..utils.config import MambularConfig +from ..utils.configs import DefaultMambularConfig from ..utils.dataset import MambularDataModule, MambularDataset from ..utils.preprocessor import Preprocessor @@ -24,10 +25,72 @@ class MambularClassifier(BaseEstimator): Parameters ---------- - **kwargs : Various - Accepts any number of keyword arguments that are passed to the MambularConfig and Preprocessor classes. - Known configuration arguments for the model are extracted based on a predefined list, and the rest are - passed to the Preprocessor. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. 
+ weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. + dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). + head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. 
+ Attributes @@ -42,41 +105,56 @@ class MambularClassifier(BaseEstimator): def __init__(self, **kwargs): # Known config arguments - print("Received kwargs:", kwargs) config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", - "tabular_head_units", - "tabular_head_activation", - "tabular_head_dropout", - "num_emebedding_activation", - "layer_norm_after_embedding", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", ] - self.config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**self.config_kwargs) - # The rest are assumed to be preprocessor arguments + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", + ] + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultMambularConfig(**self.config_kwargs) + preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "regression": + warnings.warn( + "The task is set to 'regression'. MambularClassifier is designed for classification tasks. Setting the task to classification", + UserWarning, + ) + preprocessor_kwargs["task"] = "classification" + self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None @@ -126,8 +204,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -199,8 +276,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): An instance of MambularDataModule containing training and validation DataLoaders. 
""" - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -220,26 +296,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -247,13 +319,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): num_key = "num_" + key # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.long) @@ -326,20 +396,20 @@ def fit( self, X, y, - val_size=0.2, + val_size: float = 0.2, X_val=None, y_val=None, - max_epochs=100, - random_state=101, - batch_size=64, - shuffle=True, - patience=10, - monitor="val_loss", - mode="min", - lr=1e-3, - lr_patience=10, - factor=0.75, - weight_decay=0.025, + max_epochs: int = 100, + random_state: int = 101, + batch_size: int = 128, + shuffle: bool = True, + patience: int = 15, + monitor: str = "val_loss", + mode: str = "min", + lr: float = 1e-4, + lr_patience: int = 10, + factor: float = 0.1, + weight_decay: float = 1e-06, **trainer_kwargs ): """ @@ -489,14 +559,23 @@ def predict(self, X): # Perform inference with torch.no_grad(): logits = self.model(cat_tensors, num_tensors) - predictions = torch.argmax(logits, dim=1) + + # Check the shape of the logits to determine binary or multi-class classification + if logits.shape[1] == 1: + # Binary classification + probabilities = torch.sigmoid(logits) + predictions = (probabilities > 0.5).long().squeeze() + else: + # Multi-class classification + probabilities = torch.softmax(logits, dim=1) + predictions = torch.argmax(probabilities, dim=1) # Convert predictions to NumPy array and return return predictions.cpu().numpy() def predict_proba(self, X): """ - Predict class probabilities for the given input samples. + Predict class probabilities for the given input samples. 
Parameters ---------- @@ -554,7 +633,10 @@ def predict_proba(self, X): # Perform inference with torch.no_grad(): logits = self.model(cat_tensors, num_tensors) - probabilities = torch.softmax(logits, dim=1) + if logits.shape[1] > 1: + probabilities = torch.softmax(logits, dim=1) + else: + probabilities = torch.sigmoid(logits) # Convert probabilities to NumPy array and return return probabilities.cpu().numpy() From 234394cf961336dc573b21e764b0091d8b7f021f Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:27:31 +0000 Subject: [PATCH 12/21] adjust LSS basemodel for new config file --- mambular/base_models/distributional.py | 232 ++++++++++++++++--------- 1 file changed, 148 insertions(+), 84 deletions(-) diff --git a/mambular/base_models/distributional.py b/mambular/base_models/distributional.py index 3ed84ff..0d0db5d 100644 --- a/mambular/base_models/distributional.py +++ b/mambular/base_models/distributional.py @@ -2,13 +2,27 @@ import torch import torch.nn as nn -from ..utils.config import MambularConfig -from ..utils.distributions import (BetaDistribution, CategoricalDistribution, - DirichletDistribution, GammaDistribution, - InverseGammaDistribution, - NegativeBinomialDistribution, - NormalDistribution, PoissonDistribution, - StudentTDistribution) +from ..utils.configs import DefaultMambularConfig +from ..utils.mlp_utils import MLP +from ..utils.distributions import ( + BetaDistribution, + CategoricalDistribution, + DirichletDistribution, + GammaDistribution, + InverseGammaDistribution, + NegativeBinomialDistribution, + NormalDistribution, + PoissonDistribution, + StudentTDistribution, +) +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) from ..utils.mamba_arch import Mamba @@ -24,71 +38,74 @@ class BaseMambularLSS(pl.LightningModule): family : str The name of the statistical distribution family to be used for modeling. Supported families include 'normal', 'poisson', 'gamma', 'beta', 'dirichlet', 'studentt', 'negativebinom', 'inversegamma', and 'categorical'. - config : MambularConfig - An instance of MambularConfig containing configuration parameters for the model architecture. - cat_feature_info : dict, optional - A dictionary mapping the names of categorical features to their number of unique categories. Defaults to None. - num_feature_info : dict, optional - A dictionary mapping the names of numerical features to their number of dimensions after embedding. Defaults to None. - lr : float, optional - The initial learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - The number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay (L2 penalty) coefficient. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced. Defaults to 0.75. - **distribution_params : - Additional parameters specific to the chosen statistical distribution family. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). + **kwargs : dict + Additional keyword arguments. Attributes ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. 
+ weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. mamba : Mamba - The core neural network module implementing the Mamba architecture. + Mamba architecture component. norm_f : nn.Module - Normalization layer applied after the Mamba block. - tabular_head : nn.Linear - Final linear layer mapping the features to the parameters of the chosen statistical distribution. - loss_fct : callable - The loss function derived from the chosen statistical distribution. + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + loss_fct : nn.Module + Loss function. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. """ def __init__( self, family, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultMambularConfig = DefaultMambularConfig(), **distribution_params, ): super().__init__() - self.config = config - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), - "linear": nn.Identity(), - } - distribution_classes = { "normal": NormalDistribution, "poisson": PoissonDistribution, @@ -107,63 +124,106 @@ def __init__( else: raise ValueError("Unsupported family: {}".format(family)) - self.embedding_activation = activations.get( - self.config.num_embedding_activation.lower() - ) + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == 
"LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") + if self.embedding_activation is None: raise ValueError( - f"Unsupported activation function: {self.config.num_embedding_activation}" + f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" ) + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), + ) + self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(input_shape, self.config.d_model, bias=False), - nn.BatchNorm1d(self.config.d_model), - # Example using ReLU as the activation function, change as needed + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), self.embedding_activation, ) for feature_name, input_shape in num_feature_info.items() ] ) - # Create embedding layers for categorical features based on cat_feature_info self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() - ) + head_activation = self.hparams.get("head_activation", config.head_activation) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + output_units=self.family.param_count, + ) - # Iterate over the specified units for each layer in the MLP - for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) - # Add the final linear layer to map to #distributional param output values - mlp_layers.append(nn.Linear(input_dim, self.family.param_count)) + self.loss_fct 
= nn.MSELoss() - # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) self.loss_fct = lambda predictions, y_true: self.family.compute_loss( predictions, y_true ) - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) - self.pooling_method = self.config.pooling_method - def forward(self, cat_features, num_features): """ Defines the forward pass of the model, processing both categorical and numerical features, @@ -197,6 +257,8 @@ def forward(self, cat_features, num_features): ] cat_embeddings = torch.stack(cat_embeddings, dim=1) cat_embeddings = torch.squeeze(cat_embeddings, dim=2) + if self.hparams.get("layer_norm_after_embedding"): + cat_embeddings = self.embedding_norm(cat_embeddings) else: cat_embeddings = None @@ -206,6 +268,8 @@ def forward(self, cat_features, num_features): emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) ] num_embeddings = torch.stack(num_embeddings, dim=1) + if self.hparams.get("layer_norm_after_embedding"): + num_embeddings = self.embedding_norm(num_embeddings) else: num_embeddings = None @@ -304,7 +368,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( From ca92f7f62818c0ffb70c810e1ea4d3893ae471fa Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:52:59 +0000 Subject: [PATCH 13/21] delete unnecessary valueerror --- mambular/base_models/regressor.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mambular/base_models/regressor.py b/mambular/base_models/regressor.py index 4171042..c8fca2c 100644 --- a/mambular/base_models/regressor.py +++ b/mambular/base_models/regressor.py @@ -129,11 +129,6 @@ def __init__( else: raise ValueError(f"Unsupported normalization layer: {norm_layer}") - if self.embedding_activation is None: - raise ValueError( - f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" - ) - self.num_embeddings = nn.ModuleList( [ nn.Sequential( From 16d72922818919fb9e6cb047de1ea332893de5bb Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:53:13 +0000 Subject: [PATCH 14/21] adjust distributional regression to new config --- mambular/base_models/distributional.py | 31 ++-- mambular/models/sklearn_distributional.py | 214 +++++++++++++++------- 2 files changed, 158 insertions(+), 87 deletions(-) diff --git a/mambular/base_models/distributional.py b/mambular/base_models/distributional.py index 0d0db5d..bcc7ead 100644 --- a/mambular/base_models/distributional.py +++ b/mambular/base_models/distributional.py @@ -78,8 +78,6 @@ class BaseMambularLSS(pl.LightningModule): Multi-layer perceptron head for tabular data. cls_token : nn.Parameter Class token parameter. - loss_fct : nn.Module - Loss function. embedding_norm : nn.Module, optional Layer normalization applied after embedding if specified. 
""" @@ -90,7 +88,8 @@ def __init__( cat_feature_info, num_feature_info, config: DefaultMambularConfig = DefaultMambularConfig(), - **distribution_params, + distributional_kwargs=None, + **kwargs, ): super().__init__() @@ -106,6 +105,10 @@ def __init__( self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) + distribution_classes = { "normal": NormalDistribution, "poisson": PoissonDistribution, @@ -118,9 +121,11 @@ def __init__( "categorical": CategoricalDistribution, } + if distributional_kwargs is None: + distributional_kwargs = {} + if family in distribution_classes: - # Pass additional distribution_params to the constructor of the distribution class - self.family = distribution_classes[family](**distribution_params) + self.family = distribution_classes[family](**distributional_kwargs) else: raise ValueError("Unsupported family: {}".format(family)) @@ -143,11 +148,6 @@ def __init__( else: raise ValueError(f"Unsupported normalization layer: {norm_layer}") - if self.embedding_activation is None: - raise ValueError( - f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" - ) - # Additional layers and components initialization based on hyperparameters self.mamba = Mamba( d_model=self.hparams.get("d_model", config.d_model), @@ -206,7 +206,7 @@ def __init__( use_batch_norm=self.hparams.get( "head_use_batch_norm", config.head_use_batch_norm ), - output_units=self.family.param_count, + n_output_units=self.family.param_count, ) self.cls_token = nn.Parameter( @@ -220,9 +220,8 @@ def __init__( self.hparams.get("d_model", config.d_model) ) - self.loss_fct = lambda predictions, y_true: self.family.compute_loss( - predictions, y_true - ) + def compute_loss(self, predictions, y_true): + return self.family.compute_loss(predictions, y_true) def forward(self, cat_features, num_features): """ @@ -322,7 +321,7 @@ def training_step(self, batch, batch_idx): num_features, cat_features, labels = batch preds = self(num_features, cat_features) - loss = self.loss_fct(preds, labels) + loss = self.compute_loss(preds, labels) self.log( "train_loss", loss, @@ -348,7 +347,7 @@ def validation_step(self, batch, batch_idx): num_features, cat_features, labels = batch preds = self(num_features, cat_features) - loss = self.loss_fct(preds, labels) + loss = self.compute_loss(preds, labels) self.log( "val_loss", loss, diff --git a/mambular/models/sklearn_distributional.py b/mambular/models/sklearn_distributional.py index 550b4b3..3fbf006 100644 --- a/mambular/models/sklearn_distributional.py +++ b/mambular/models/sklearn_distributional.py @@ -1,5 +1,6 @@ import lightning as pl import numpy as np +import warnings import pandas as pd import properscoring as ps import torch @@ -10,12 +11,17 @@ from torch.utils.data import DataLoader from ..base_models.distributional import BaseMambularLSS -from ..utils.config import MambularConfig +from ..utils.configs import DefaultMambularConfig from ..utils.dataset import MambularDataModule, MambularDataset -from ..utils.distributional_metrics import (beta_brier_score, dirichlet_error, - gamma_deviance, inverse_gamma_loss, - negative_binomial_deviance, - poisson_deviance, student_t_loss) +from ..utils.distributional_metrics import ( + beta_brier_score, + dirichlet_error, + gamma_deviance, + inverse_gamma_loss, + negative_binomial_deviance, + poisson_deviance, + student_t_loss, +) from ..utils.preprocessor import Preprocessor @@ 
-27,62 +33,139 @@ class MambularLSS(BaseEstimator): facilitating end-to-end training and prediction workflows. The initialization of this class separates configuration arguments for the model and - the preprocessor, allowing for flexible adjustment of parameters. + the preprocessor, allowing for flexible adjustment of parameters. Parameters ---------- - **kwargs : Arbitrary keyword arguments, divided into configuration for the model and - preprocessing. Recognized keys include model parameters such as 'd_model', - 'n_layers', etc., and any additional keys are assumed to be preprocessor arguments. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. + weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. + dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). + head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. 
+ Attributes ---------- config : MambularConfig - Configuration object containing model-specific parameters. + Configuration object that holds model-specific settings. preprocessor : Preprocessor - Preprocessor object for data preprocessing steps. - model : torch.nn.Module - The neural network model, initialized based on 'config'. - - + Preprocessor object for handling feature preprocessing like normalization and encoding. + model : BaseMambularClassifier or None + The underlying PyTorch Lightning model, instantiated upon calling the `fit` method. """ def __init__(self, **kwargs): # Known config arguments config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", - "tabular_head_units", - "tabular_head_activation", - "tabular_head_dropout", - "num_emebedding_activation", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", + ] + + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", ] - config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**config_kwargs) + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultMambularConfig(**self.config_kwargs) - # The rest are assumed to be preprocessor arguments preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "regression": + warnings.warn( + "The task in preprocessing binning is set to 'regression'. Make sure that this is correct for your distributional family ", + UserWarning, + ) + self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None @@ -135,8 +218,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -210,8 +292,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): MambularDataModule An object containing DataLoaders for training and validation datasets. 
""" - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -231,26 +312,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -258,13 +335,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): num_key = "num_" + key # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.float) @@ -274,8 +349,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): train_dataset = MambularDataset( train_cat_tensors, train_num_tensors, train_labels ) - val_dataset = MambularDataset( - val_cat_tensors, val_num_tensors, val_labels) + val_dataset = MambularDataset(val_cat_tensors, val_num_tensors, val_labels) # Create dataloaders train_dataloader = DataLoader( @@ -336,21 +410,21 @@ def fit( X, y, family, - val_size=0.2, + val_size: float = 0.2, X_val=None, y_val=None, - max_epochs=100, - random_state=101, - batch_size=64, - shuffle=True, - patience=10, - monitor="val_loss", - mode="min", - lr=1e-3, - lr_patience=10, - factor=0.75, - weight_decay=0.025, - **trainer_kwargs, + max_epochs: int = 100, + random_state: int = 101, + batch_size: int = 128, + shuffle: bool = True, + patience: int = 15, + monitor: str = "val_loss", + mode: str = "min", + lr: float = 1e-4, + lr_patience: int = 10, + factor: float = 0.1, + weight_decay: float = 1e-06, + **trainer_kwargs ): """ Fits the model to the provided data, using the specified loss distribution family for the prediction task. 
@@ -519,8 +593,7 @@ def evaluate(self, X, y_true, metrics=None, distribution_family=None): """ # Infer distribution family from model settings if not provided if distribution_family is None: - distribution_family = getattr( - self.model, "distribution_family", "normal") + distribution_family = getattr(self.model, "distribution_family", "normal") # Setup default metrics if none are provided if metrics is None: @@ -558,8 +631,7 @@ def get_default_metrics(self, distribution_family): "MSE": lambda y, pred: mean_squared_error(y, pred[:, 0]), "CRPS": lambda y, pred: np.mean( [ - ps.crps_gaussian(y[i], mu=pred[i, 0], - sig=np.sqrt(pred[i, 1])) + ps.crps_gaussian(y[i], mu=pred[i, 0], sig=np.sqrt(pred[i, 1])) for i in range(len(y)) ] ), From ea407dd1b30ced154036270366f90695b0b83cc0 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 14:53:27 +0000 Subject: [PATCH 15/21] adjust prepro task warning --- mambular/models/sklearn_classifier.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mambular/models/sklearn_classifier.py b/mambular/models/sklearn_classifier.py index bc6626e..0e04918 100644 --- a/mambular/models/sklearn_classifier.py +++ b/mambular/models/sklearn_classifier.py @@ -150,9 +150,11 @@ def __init__(self, **kwargs): # Raise a warning if task is set to 'classification' if preprocessor_kwargs.get("task") == "regression": warnings.warn( - "The task is set to 'regression'. MambularClassifier is designed for classification tasks. Setting the task to classification", + "The task in preprocessing binning is set to 'regression'. MambularClassifier is designed for classification tasks.", UserWarning, ) + + if "task" not in list(preprocessor_kwargs.keys()): preprocessor_kwargs["task"] = "classification" self.preprocessor = Preprocessor(**preprocessor_kwargs) From 35a22b3d75717b16b306ad6f61ea05ba03fff1f1 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:16:20 +0000 Subject: [PATCH 16/21] delete valuerror warning --- mambular/base_models/classifier.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mambular/base_models/classifier.py b/mambular/base_models/classifier.py index 8c818a7..75aa29d 100644 --- a/mambular/base_models/classifier.py +++ b/mambular/base_models/classifier.py @@ -36,7 +36,7 @@ class BaseMambularClassifier(pl.LightningModule): Additional keyword arguments. - Attributes + Attributes ---------- lr : float Learning rate. 
@@ -145,11 +145,6 @@ def __init__( else: raise ValueError(f"Unsupported normalization layer: {norm_layer}") - if self.embedding_activation is None: - raise ValueError( - f"Unsupported activation function: {self.hparams.get('num_embedding_activation')}" - ) - self.num_embeddings = nn.ModuleList( [ nn.Sequential( From 6b2813e2ae4fedf45e31b869b6d446181e5e1fa0 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:30:28 +0000 Subject: [PATCH 17/21] adjust embedding regressor classes to new config --- mambular/base_models/embedding_regressor.py | 216 +++++++++++------- .../models/sklearn_embedding_regressor.py | 155 +++++++++---- 2 files changed, 253 insertions(+), 118 deletions(-) diff --git a/mambular/base_models/embedding_regressor.py b/mambular/base_models/embedding_regressor.py index d7c3b93..5d0921b 100644 --- a/mambular/base_models/embedding_regressor.py +++ b/mambular/base_models/embedding_regressor.py @@ -1,8 +1,16 @@ import lightning as pl import torch import torch.nn as nn - -from ..utils.config import MambularConfig +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) + +from ..utils.configs import DefaultMambularConfig from ..utils.mamba_arch import Mamba from ..utils.mlp_utils import MLP @@ -15,20 +23,12 @@ class BaseEmbeddingMambularRegressor(pl.LightningModule): Parameters ---------- - config : MambularConfig - Configuration parameters for the model architecture. - cat_feature_info : dict, optional - Information about categorical features, mapping feature names to the number of unique categories. Defaults to None. - num_feature_info : dict, optional - Information about numerical features, mapping feature names to their number of dimensions after embedding. Defaults to None. - lr : float, optional - Learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - Number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay coefficient for regularization in the optimizer. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced by the scheduler. Defaults to 0.75. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). seq_size : int, optional Size of sequence chunks for processing numerical features. Relevant when `raw_embeddings` is False. raw_embeddings : bool, optional @@ -37,66 +37,108 @@ class BaseEmbeddingMambularRegressor(pl.LightningModule): Attributes ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. mamba : Mamba - The core neural network module implementing the Mamba architecture. + Mamba architecture component. norm_f : nn.Module - Normalization layer applied after the Mamba block. 
- tabular_head : nn.Linear - Final linear layer mapping the features to the regression target. - loss_fct : nn.MSELoss - The loss function for regression tasks. + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. + loss_fct : nn.Module + The loss function used for training the model, MSE loss. + """ def __init__( self, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultMambularConfig = DefaultMambularConfig(), seq_size: int = 20, raw_embeddings=False, - head_layer_sizes=[64, 32, 32], - head_dropout: float = 0.3, - head_skip_layers: bool = False, - head_activation="leakyrelu", - head_use_batch_norm: bool = False, - attn_dropout: float = 0.3, + **kwargs, ): super().__init__() - self.config = config - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info + + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) self.seq_size = seq_size self.raw_embeddings = raw_embeddings - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), - "linear": nn.Identity(), - } - - self.embedding_activation = activations.get( - self.config.num_embedding_activation.lower() + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), ) - if self.embedding_activation is None: - raise ValueError( - 
f"Unsupported activation function: {self.config.num_embedding_activation}" + + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") if not self.raw_embeddings: data_size = len(num_feature_info.items()) @@ -104,7 +146,11 @@ def __init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(self.seq_size, self.config.d_model, bias=False), + nn.Linear( + self.seq_size, + self.hparams.get("d_model", config.d_model), + bias=False, + ), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) @@ -117,43 +163,55 @@ def __init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(1, self.config.d_model, bias=False), + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) - for _ in range(num_embedding_modules) + for feature_name, input_shape in num_feature_info.items() ] ) self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) - head_activation = activations.get(head_activation.lower(), nn.Identity()) + head_activation = self.hparams.get("head_activation", config.head_activation) - # Combine all layers into a Sequential module self.tabular_head = MLP( - self.config.d_model, - hidden_units_list=head_layer_sizes, - dropout_rate=head_dropout, - use_skip_layers=head_skip_layers, + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), activation_fn=head_activation, - use_batch_norm=head_use_batch_norm, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), ) - self.pooling_method = self.config.pooling_method - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) - - if self.config.layer_norm_after_embedding: - self.embedding_norm = nn.LayerNorm(self.config.d_model) + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) self.loss_fct = nn.MSELoss() + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) + def forward(self, cat_features, num_features): """ Defines the forward pass of the model, processing both categorical and numerical features, @@ -219,7 
+277,7 @@ def forward(self, cat_features, num_features): ] cat_embeddings = torch.stack(cat_embeddings, dim=1) cat_embeddings = torch.squeeze(cat_embeddings, dim=2) - if self.config.layer_norm_after_embedding: + if self.hparams.get("layer_norm_after_embedding"): cat_embeddings = self.embedding_norm(cat_embeddings) else: cat_embeddings = None @@ -230,7 +288,7 @@ def forward(self, cat_features, num_features): emb(num_features[i]) for i, emb in enumerate(self.num_embeddings) ] num_embeddings = torch.stack(num_embeddings, dim=1) - if self.config.layer_norm_after_embedding: + if self.hparams.get("layer_norm_after_embedding"): num_embeddings = self.embedding_norm(num_embeddings) else: num_embeddings = None @@ -331,7 +389,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. """ optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( diff --git a/mambular/models/sklearn_embedding_regressor.py b/mambular/models/sklearn_embedding_regressor.py index e39bf31..632e8a3 100644 --- a/mambular/models/sklearn_embedding_regressor.py +++ b/mambular/models/sklearn_embedding_regressor.py @@ -1,4 +1,5 @@ import lightning as pl +import warnings import pandas as pd import torch from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint @@ -9,7 +10,7 @@ from torch.utils.data import DataLoader from ..base_models.embedding_regressor import BaseEmbeddingMambularRegressor -from ..utils.config import MambularConfig +from ..utils.configs import DefaultMambularConfig from ..utils.dataset import EmbeddingMambularDataset, MambularDataModule from ..utils.preprocessor import Preprocessor @@ -22,8 +23,71 @@ class EmbeddingMambularRegressor(BaseEstimator): Parameters ---------- - **kwargs : Keyword arguments that can include both configuration parameters for the MambularConfig and - parameters for the preprocessor. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. + weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. 
+ dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). + head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. Attributes @@ -32,47 +96,68 @@ class EmbeddingMambularRegressor(BaseEstimator): Configuration object containing model-specific parameters. preprocessor : Preprocessor Preprocessor object for data preprocessing steps. - model : ProteinMambularRegressor + model : BaseEmbeddingMambularRegressor The neural network model, initialized after the `fit` method is called. 
""" def __init__(self, **kwargs): # Known config arguments config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", ] - config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**config_kwargs) - # The rest are assumed to be preprocessor arguments + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", + ] + + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultMambularConfig(**self.config_kwargs) + preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + if "numerical_preprocessing" not in list(preprocessor_kwargs.keys()): + preprocessor_kwargs["numerical_preprocessing"] = "standardization" - if not "numerical_preprocessing" in preprocessor_kwargs.keys(): - preprocessor_kwargs["numerical_preprocessing"] = "normalization" self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "classification": + warnings.warn( + "The task is set to 'classification'. MambularRegressor is designed for regression tasks.", + UserWarning, + ) + def get_params(self, deep=True): """ Get parameters for this estimator. Overrides the BaseEstimator method. @@ -117,8 +202,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -189,8 +273,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): data_module : MambularDataModule An instance of MambularDataModule containing the training and validation DataLoaders. 
""" - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -210,26 +293,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -239,13 +318,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): ) # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.float) From e71e13676fe550b3483d787d1bad7b1bcb2b532a Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:30:39 +0000 Subject: [PATCH 18/21] adjust embedding classifiers to new config --- mambular/base_models/embedding_classifier.py | 225 ++++++++++++------ .../models/sklearn_embedding_classifier.py | 164 +++++++++---- 2 files changed, 274 insertions(+), 115 deletions(-) diff --git a/mambular/base_models/embedding_classifier.py b/mambular/base_models/embedding_classifier.py index ba05649..a5c4551 100644 --- a/mambular/base_models/embedding_classifier.py +++ b/mambular/base_models/embedding_classifier.py @@ -3,8 +3,17 @@ import torch.nn as nn import torchmetrics -from ..utils.config import MambularConfig from ..utils.mamba_arch import Mamba +from ..utils.mlp_utils import MLP +from ..utils.normalization_layers import ( + RMSNorm, + LayerNorm, + LearnableLayerScaling, + BatchNorm, + InstanceNorm, + GroupNorm, +) +from ..utils.configs import DefaultMambularConfig class BaseEmbeddingMambularClassifier(pl.LightningModule): @@ -15,20 +24,14 @@ class BaseEmbeddingMambularClassifier(pl.LightningModule): Parameters ---------- - config : MambularConfig - Configuration parameters for the model architecture. - cat_feature_info : dict, optional - Information about categorical features, mapping feature names to the number of unique categories. - num_feature_info : dict, optional - Information about numerical features, mapping feature names to their number of dimensions after embedding. 
- lr : float, optional - Learning rate for the optimizer. Defaults to 1e-03. - lr_patience : int, optional - Number of epochs with no improvement after which learning rate will be reduced. Defaults to 10. - weight_decay : float, optional - Weight decay coefficient for regularization in the optimizer. Defaults to 0.025. - lr_factor : float, optional - Factor by which the learning rate will be reduced by the scheduler. Defaults to 0.75. + num_classes : int + number of classes for classification. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + config : DefaultMambularConfig, optional + Configuration object containing default hyperparameters for the model (default is DefaultMambularConfig()). seq_size : int, optional Size of sequence chunks for processing numerical features. Relevant when `raw_embeddings` is False. raw_embeddings : bool, optional @@ -37,52 +40,116 @@ class BaseEmbeddingMambularClassifier(pl.LightningModule): Attributes ---------- + lr : float + Learning rate. + lr_patience : int + Patience for learning rate scheduler. + weight_decay : float + Weight decay for optimizer. + lr_factor : float + Factor by which the learning rate will be reduced. + pooling_method : str + Method to pool the features. + cat_feature_info : dict + Dictionary containing information about categorical features. + num_feature_info : dict + Dictionary containing information about numerical features. + embedding_activation : callable + Activation function for embeddings. mamba : Mamba - The core neural network module implementing the Mamba architecture. + Mamba architecture component. norm_f : nn.Module - Normalization layer applied after the Mamba block. - tabular_head : nn.Linear - Final linear layer mapping the features to the target. + Normalization layer. + num_embeddings : nn.ModuleList + Module list for numerical feature embeddings. + cat_embeddings : nn.ModuleList + Module list for categorical feature embeddings. + tabular_head : MLP + Multi-layer perceptron head for tabular data. + cls_token : nn.Parameter + Class token parameter. + embedding_norm : nn.Module, optional + Layer normalization applied after embedding if specified. + loss_fct : nn.Module + The loss function used for training the model, configured based on the number of classes. + acc : torchmetrics.Accuracy + A metric for computing the accuracy of predictions. + auroc : torchmetrics.AUROC + A metric for computing the Area Under the Receiver Operating Characteristic curve. + precision : torchmetrics.Precision + A metric for computing the precision of predictions. 
+ """ def __init__( self, num_classes, - config: MambularConfig, - cat_feature_info: dict = None, - num_feature_info: dict = None, - lr=1e-03, - lr_patience=10, - weight_decay=0.025, - lr_factor=0.75, + cat_feature_info, + num_feature_info, + config: DefaultMambularConfig = DefaultMambularConfig(), seq_size: int = 20, raw_embeddings=False, + **kwargs, ): super().__init__() - self.config = config self.num_classes = 1 if num_classes == 2 else num_classes - self.lr = lr - self.lr_patience = lr_patience - self.weight_decay = weight_decay - self.lr_factor = lr_factor + # Save all hyperparameters + self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + + # Assigning values from hyperparameters + self.lr = self.hparams.get("lr", config.lr) + self.lr_patience = self.hparams.get("lr_patience", config.lr_patience) + self.weight_decay = self.hparams.get("weight_decay", config.weight_decay) + self.lr_factor = self.hparams.get("lr_factor", config.lr_factor) + self.pooling_method = self.hparams.get("pooling_method", config.pooling_method) self.cat_feature_info = cat_feature_info self.num_feature_info = num_feature_info + + self.embedding_activation = self.hparams.get( + "num_embedding_activation", config.num_embedding_activation + ) self.seq_size = seq_size self.raw_embeddings = raw_embeddings - activations = { - "relu": nn.ReLU(), - "tanh": nn.Tanh(), - "sigmoid": nn.Sigmoid(), - "leaky_relu": nn.LeakyReLU(), - "elu": nn.ELU(), - "selu": nn.SELU(), - "gelu": nn.GELU(), - "softplus": nn.Softplus(), - "leakyrelu": nn.LeakyReLU(), - "linear": nn.Identity(), - } + # Additional layers and components initialization based on hyperparameters + self.mamba = Mamba( + d_model=self.hparams.get("d_model", config.d_model), + n_layers=self.hparams.get("n_layers", config.n_layers), + expand_factor=self.hparams.get("expand_factor", config.expand_factor), + bias=self.hparams.get("bias", config.bias), + d_conv=self.hparams.get("d_conv", config.d_conv), + conv_bias=self.hparams.get("conv_bias", config.conv_bias), + dropout=self.hparams.get("dropout", config.dropout), + dt_rank=self.hparams.get("dt_rank", config.dt_rank), + d_state=self.hparams.get("d_state", config.d_state), + dt_scale=self.hparams.get("dt_scale", config.dt_scale), + dt_init=self.hparams.get("dt_init", config.dt_init), + dt_max=self.hparams.get("dt_max", config.dt_max), + dt_min=self.hparams.get("dt_min", config.dt_min), + dt_init_floor=self.hparams.get("dt_init_floor", config.dt_init_floor), + norm=globals()[self.hparams.get("norm", config.norm)], + activation=self.hparams.get("activation", config.activation), + ) + + # Set the normalization layer dynamically + norm_layer = self.hparams.get("norm", config.norm) + if norm_layer == "RMSNorm": + self.norm_f = RMSNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LayerNorm": + self.norm_f = LayerNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "BatchNorm": + self.norm_f = BatchNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "InstanceNorm": + self.norm_f = InstanceNorm(self.hparams.get("d_model", config.d_model)) + elif norm_layer == "GroupNorm": + self.norm_f = GroupNorm(1, self.hparams.get("d_model", config.d_model)) + elif norm_layer == "LearnableLayerScaling": + self.norm_f = LearnableLayerScaling( + self.hparams.get("d_model", config.d_model) + ) + else: + raise ValueError(f"Unsupported normalization layer: {norm_layer}") if not self.raw_embeddings: data_size = len(num_feature_info.items()) @@ -90,8 +157,11 @@ def 
__init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(self.seq_size, - self.config.d_model, bias=False), + nn.Linear( + self.seq_size, + self.hparams.get("d_model", config.d_model), + bias=False, + ), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) @@ -104,49 +174,55 @@ def __init__( self.num_embeddings = nn.ModuleList( [ nn.Sequential( - nn.Linear(1, self.config.d_model, bias=False), + nn.Linear( + input_shape, + self.hparams.get("d_model", config.d_model), + bias=False, + ), # Example using ReLU as the activation function, change as needed self.embedding_activation, ) - for _ in range(num_embedding_modules) + for feature_name, input_shape in num_feature_info.items() ] ) self.cat_embeddings = nn.ModuleList( [ - nn.Embedding(num_categories + 1, self.config.d_model) + nn.Embedding( + num_categories + 1, self.hparams.get("d_model", config.d_model) + ) for feature_name, num_categories in cat_feature_info.items() ] ) - self.mamba = Mamba(self.config) - self.norm_f = self.config.norm(self.config.d_model) - mlp_activation_fn = activations.get( - self.config.tabular_head_activation.lower(), nn.Identity() - ) - - # Dynamically create MLP layers based on config.tabular_units - mlp_layers = [] - input_dim = self.config.d_model # Initial input dimension - - # Iterate over the specified units for each layer in the MLP - for units in self.config.tabular_head_units: - mlp_layers.append(nn.Linear(input_dim, units)) - mlp_layers.append(mlp_activation_fn) - mlp_layers.append(nn.Dropout(self.config.tabular_head_dropout)) - input_dim = units + head_activation = self.hparams.get("head_activation", config.head_activation) - # Add the final linear layer to map to a single output value - mlp_layers.append(nn.Linear(input_dim, self.num_classes)) + self.tabular_head = MLP( + self.hparams.get("d_model", config.d_model), + hidden_units_list=self.hparams.get( + "head_layer_sizes", config.head_layer_sizes + ), + dropout_rate=self.hparams.get("head_dropout", config.head_dropout), + use_skip_layers=self.hparams.get( + "head_skip_layers", config.head_skip_layers + ), + activation_fn=head_activation, + use_batch_norm=self.hparams.get( + "head_use_batch_norm", config.head_use_batch_norm + ), + n_output_units=self.num_classes, + ) - # Combine all layers into a Sequential module - self.tabular_head = nn.Sequential(*mlp_layers) + self.cls_token = nn.Parameter( + torch.zeros(1, 1, self.hparams.get("d_model", config.d_model)) + ) - self.pooling_method = self.config.pooling_method - self.cls_token = nn.Parameter(torch.zeros(1, 1, self.config.d_model)) + self.loss_fct = nn.MSELoss() - if self.config.layer_norm_after_embedding: - self.embedding_norm = nn.LayerNorm(self.config.d_model) + if self.hparams.get("layer_norm_after_embedding"): + self.embedding_norm = nn.LayerNorm( + self.hparams.get("d_model", config.d_model) + ) if self.num_classes > 2: self.loss_fct = nn.CrossEntropyLoss() @@ -184,8 +260,7 @@ def forward(self, cat_features, num_features): The output predictions of the model for regression tasks. """ batch_size = ( - cat_features[0].size(0) if cat_features != [ - ] else num_features[0].size(0) + cat_features[0].size(0) if cat_features != [] else num_features[0].size(0) ) cls_tokens = self.cls_token.expand(batch_size, -1, -1) # Process categorical features if present @@ -398,7 +473,7 @@ def configure_optimizers(self): A dictionary containing the optimizer and lr_scheduler configurations. 
""" optimizer = torch.optim.Adam( - self.parameters(), lr=self.lr, weight_decay=self.config.weight_decay + self.parameters(), lr=self.lr, weight_decay=self.weight_decay ) scheduler = { "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau( diff --git a/mambular/models/sklearn_embedding_classifier.py b/mambular/models/sklearn_embedding_classifier.py index 737549f..3dc8216 100644 --- a/mambular/models/sklearn_embedding_classifier.py +++ b/mambular/models/sklearn_embedding_classifier.py @@ -1,6 +1,8 @@ import lightning as pl import numpy as np import pandas as pd +import warnings + import torch from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint from sklearn.base import BaseEstimator @@ -10,7 +12,7 @@ from torch.utils.data import DataLoader from ..base_models.embedding_classifier import BaseEmbeddingMambularClassifier -from ..utils.config import MambularConfig +from ..utils.configs import DefaultMambularConfig from ..utils.dataset import EmbeddingMambularDataset, MambularDataModule from ..utils.preprocessor import Preprocessor @@ -24,51 +26,141 @@ class EmbeddingMambularClassifier(BaseEstimator): Parameters ---------- - **kwargs : Configuration parameters that can include both MambularConfig settings and preprocessing - options. Any unrecognized parameters are passed to the preprocessor. + # configuration parameters + lr : float, optional + Learning rate for the optimizer. Default is 1e-4. + lr_patience : int, optional + Number of epochs with no improvement on the validation loss to wait before reducing the learning rate. Default is 10. + weight_decay : float, optional + Weight decay (L2 penalty) coefficient. Default is 1e-6. + lr_factor : float, optional + Factor by which the learning rate will be reduced. Default is 0.1. + d_model : int, optional + Dimension of the model. Default is 64. + n_layers : int, optional + Number of layers. Default is 8. + expand_factor : int, optional + Expansion factor. Default is 2. + bias : bool, optional + Whether to use bias. Default is False. + d_conv : int, optional + Dimension of the convolution. Default is 16. + conv_bias : bool, optional + Whether to use bias in the convolution. Default is True. + dropout : float, optional + Dropout rate in the mamba blocks. Default is 0.05. + dt_rank : str, optional + Rank of the time dimension. Default is "auto". + d_state : int, optional + State dimension. Default is 16. + dt_scale : float, optional + Scale of the time dimension. Default is 1.0. + dt_init : str, optional + Initialization method for the time dimension. Default is "random". + dt_max : float, optional + Maximum value for the time dimension. Default is 0.1. + dt_min : float, optional + Minimum value for the time dimension. Default is 1e-3. + dt_init_floor : float, optional + Floor value for the time dimension initialization. Default is 1e-4. + norm : str, optional + Normalization method. Default is 'RMSNorm'. + activation : callable, optional + Activation function. Default is nn.SELU(). + num_embedding_activation : callable, optional + Activation function for numerical embeddings. Default is nn.Identity(). + head_layer_sizes : list, optional + Sizes of the layers in the head. Default is [64, 64, 32]. + head_dropout : float, optional + Dropout rate for the head. Default is 0.5. + head_skip_layers : bool, optional + Whether to use skip layers in the head. Default is False. + head_activation : callable, optional + Activation function for the head. Default is nn.SELU(). 
+ head_use_batch_norm : bool, optional + Whether to use batch normalization in the head. Default is False. + + # Preprocessor Parameters + n_bins : int, optional + The number of bins to use for numerical feature binning. Default is 50. + numerical_preprocessing : str, optional + The preprocessing strategy for numerical features. Default is 'ple'. + use_decision_tree_bins : bool, optional + If True, uses decision tree regression/classification to determine optimal bin edges for numerical feature binning. Default is False. + binning_strategy : str, optional + Defines the strategy for binning numerical features. Default is 'uniform'. + task : str, optional + Indicates the type of machine learning task ('regression' or 'classification'). Default is 'regression'. Attributes ---------- config : MambularConfig - Configuration object for the model, storing architecture-specific parameters. + Configuration object containing model-specific parameters. preprocessor : Preprocessor - Object handling data preprocessing steps such as feature encoding and normalization. - model : ProteinMambularClassifier - The underlying neural network model, instantiated during the `fit` method. + Preprocessor object for data preprocessing steps. + model : BaseEmbeddingMambularRegressor + The neural network model, initialized after the `fit` method is called. """ def __init__(self, **kwargs): # Known config arguments config_arg_names = [ + "lr", + "lr_patience", + "weight_decay", + "lr_factor", "d_model", "n_layers", - "dt_rank", - "output_dimension", - "pooling_method", - "norm", - "cls", - "dt_min", - "dt_max", - "dropout", + "expand_factor", "bias", - "weight_decay", + "d_conv", "conv_bias", + "dropout", + "dt_rank", "d_state", - "expand_factor", - "d_conv", - "dt_init", "dt_scale", + "dt_init", + "dt_max", + "dt_min", "dt_init_floor", + "norm", + "activation", + "num_embedding_activation", + "head_layer_sizes", + "head_dropout", + "head_skip_layers", + "head_activation", + "head_use_batch_norm", ] - config_kwargs = {k: v for k, - v in kwargs.items() if k in config_arg_names} - self.config = MambularConfig(**config_kwargs) - # The rest are assumed to be preprocessor arguments + preprocessor_arg_names = [ + "n_bins", + "numerical_preprocessing", + "use_decision_tree_bins", + "binning_strategy", + "task", + ] + + self.config_kwargs = {k: v for k, v in kwargs.items() if k in config_arg_names} + self.config = DefaultMambularConfig(**self.config_kwargs) + preprocessor_kwargs = { - k: v for k, v in kwargs.items() if k not in config_arg_names + k: v for k, v in kwargs.items() if k in preprocessor_arg_names } + if "numerical_preprocessing" not in list(preprocessor_kwargs.keys()): + preprocessor_kwargs["numerical_preprocessing"] = "standardization" + + # Raise a warning if task is set to 'classification' + if preprocessor_kwargs.get("task") == "regression": + warnings.warn( + "The task is set to 'regression'. 
This model is designed for classification tasks.", + UserWarning, + ) + + if "task" not in list(preprocessor_kwargs.keys()): + preprocessor_kwargs["task"] = "classification" + self.preprocessor = Preprocessor(**preprocessor_kwargs) self.model = None @@ -117,8 +209,7 @@ def set_params(self, **parameters): """ # Update config_kwargs with provided parameters valid_config_keys = self.config_kwargs.keys() - config_updates = {k: v for k, - v in parameters.items() if k in valid_config_keys} + config_updates = {k: v for k, v in parameters.items() if k in valid_config_keys} self.config_kwargs.update(config_updates) # Update the config object @@ -189,8 +280,7 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): data_module : MambularDataModule An instance of MambularDataModule containing training and validation DataLoaders. """ - train_preprocessed_data = self.preprocessor.fit_transform( - X_train, y_train) + train_preprocessed_data = self.preprocessor.fit_transform(X_train, y_train) val_preprocessed_data = self.preprocessor.transform(X_val) # Update feature info based on the actual processed data @@ -210,26 +300,22 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[cat_key], dtype=torch.long) ) if cat_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[cat_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[cat_key], dtype=torch.long) ) binned_key = "num_" + key # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( - torch.tensor( - train_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(train_preprocessed_data[binned_key], dtype=torch.long) ) if binned_key in val_preprocessed_data: val_cat_tensors.append( - torch.tensor( - val_preprocessed_data[binned_key], dtype=torch.long) + torch.tensor(val_preprocessed_data[binned_key], dtype=torch.long) ) # Populate tensors for numerical features, if present in processed data @@ -239,13 +325,11 @@ def preprocess_data(self, X_train, y_train, X_val, y_val, batch_size, shuffle): ) # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( - torch.tensor( - train_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(train_preprocessed_data[num_key], dtype=torch.float) ) if num_key in val_preprocessed_data: val_num_tensors.append( - torch.tensor( - val_preprocessed_data[num_key], dtype=torch.float) + torch.tensor(val_preprocessed_data[num_key], dtype=torch.float) ) train_labels = torch.tensor(y_train, dtype=torch.long) From b1d33e2e4a10123ce6177c939988d6c912a356b1 Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:50:30 +0000 Subject: [PATCH 19/21] raise notFitted Error when calling transform on unfitted preprocessor --- mambular/utils/preprocessor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mambular/utils/preprocessor.py b/mambular/utils/preprocessor.py index c443ed1..0da3c46 100644 --- a/mambular/utils/preprocessor.py +++ b/mambular/utils/preprocessor.py @@ -13,7 +13,7 @@ from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from .ple_encoding import PLE -from .ple_encoding import PLE +from sklearn.exceptions import 
NotFittedError class Preprocessor: @@ -277,6 +277,10 @@ def transform(self, X): dict: A dictionary where keys are the names of the features (as per the transformations defined in the column transformer) and the values are numpy arrays of the transformed data. """ + if not self.fitted: + raise NotFittedError( + "The preprocessor must be fitted before transforming new data. Use .fit or .fit_transform" + ) transformed_X = self.column_transformer.transform(X) # Now let's convert this into a dictionary of arrays, one per column From 9fc9eeaad5eaec610e6276099c648aef2df5285a Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:50:37 +0000 Subject: [PATCH 20/21] adjust imports in tests --- tests/test_classifier.py | 7 +++---- tests/test_distributions.py | 2 -- tests/test_lss.py | 15 +++++++-------- tests/test_preprocessor.py | 10 +++++----- tests/test_regressor.py | 10 ++++------ 5 files changed, 19 insertions(+), 25 deletions(-) diff --git a/tests/test_classifier.py b/tests/test_classifier.py index 440a1c9..1bda0a4 100644 --- a/tests/test_classifier.py +++ b/tests/test_classifier.py @@ -35,9 +35,9 @@ def tearDown(self): def test_initialization(self): # This assumes MambularConfig is properly imported and used in the MambularRegressor class - from mambular.utils.config import MambularConfig + from mambular.utils.configs import DefaultMambularConfig - self.assertIsInstance(self.classifier.config, MambularConfig) + self.assertIsInstance(self.classifier.config, DefaultMambularConfig) self.assertEqual(self.classifier.config.d_model, 128) self.assertEqual(self.classifier.config.dropout, 0.1) @@ -90,8 +90,7 @@ def test_evaluate(self): axis=1, keepdims=True ) self.classifier.predict = MagicMock(return_value=mock_predictions) - self.classifier.predict_proba = MagicMock( - return_value=mock_probabilities) + self.classifier.predict_proba = MagicMock(return_value=mock_probabilities) # Define metrics to test metrics = { diff --git a/tests/test_distributions.py b/tests/test_distributions.py index 21f969e..a77113f 100644 --- a/tests/test_distributions.py +++ b/tests/test_distributions.py @@ -47,8 +47,6 @@ def test_compute_loss_known_values(self): loc=predictions[:, 0], scale=torch.nn.functional.softplus(predictions[:, 1]) ) expected_loss = -test_dist.log_prob(torch.tensor(0.0)).mean() - print(loss, expected_loss) - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) def test_evaluate_nll(self): diff --git a/tests/test_lss.py b/tests/test_lss.py index f621cdf..9da647f 100644 --- a/tests/test_lss.py +++ b/tests/test_lss.py @@ -4,8 +4,9 @@ import numpy as np import pandas as pd import torch -from properscoring import \ - crps_gaussian # Assuming this is the source of the CRPS function +from properscoring import ( + crps_gaussian, +) # Assuming this is the source of the CRPS function from sklearn.metrics import mean_poisson_deviance, mean_squared_error from mambular.models import MambularLSS # Update the import path @@ -40,9 +41,9 @@ def tearDown(self): self.patcher_base_model.stop() def test_initialization(self): - from mambular.utils.config import MambularConfig + from mambular.utils.configs import DefaultMambularConfig - self.assertIsInstance(self.model.config, MambularConfig) + self.assertIsInstance(self.model.config, DefaultMambularConfig) self.assertEqual(self.model.config.d_model, 128) self.assertEqual(self.model.config.dropout, 0.1) self.assertEqual(self.model.config.n_layers, 4) @@ -91,8 +92,7 @@ def test_normal_metrics(self): "MSE": lambda y, pred: mean_squared_error(y, 
pred[:, 0]), "CRPS": lambda y, pred: np.mean( [ - crps_gaussian(y[i], mu=pred[i, 0], - sig=np.sqrt(pred[i, 1])) + crps_gaussian(y[i], mu=pred[i, 0], sig=np.sqrt(pred[i, 1])) for i in range(len(y)) ] ), @@ -124,8 +124,7 @@ def test_poisson_metrics(self): ) self.assertIn("Poisson Deviance", results) # Optionally calculate expected deviance and check - expected_deviance = mean_poisson_deviance( - self.y_test, mock_predictions) + expected_deviance = mean_poisson_deviance(self.y_test, mock_predictions) self.assertAlmostEqual(results["Poisson Deviance"], expected_deviance) diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index fb43c64..7cf4f84 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -20,7 +20,7 @@ def setUp(self): def test_initialization(self): """Test initialization of the Preprocessor with default parameters.""" - pp = Preprocessor(n_bins=20) + pp = Preprocessor(n_bins=20, numerical_preprocessing="binning") self.assertEqual(pp.n_bins, 20) self.assertEqual(pp.numerical_preprocessing, "binning") self.assertFalse(pp.use_decision_tree_bins) @@ -28,7 +28,7 @@ def test_initialization(self): def test_fit(self): """Test the fitting process of the preprocessor.""" pp = Preprocessor(numerical_preprocessing="binning", n_bins=20) - pp.fit(self.data) + pp.fit(self.data, self.target) self.assertIsNotNone(pp.column_transformer) def test_transform_not_fitted(self): @@ -40,7 +40,7 @@ def test_transform_not_fitted(self): def test_fit_transform(self): """Test fitting and transforming the data.""" pp = Preprocessor(numerical_preprocessing="standardization") - transformed_data = pp.fit_transform(self.data) + transformed_data = pp.fit_transform(self.data, self.target) self.assertIsInstance(transformed_data, dict) self.assertTrue("num_numerical" in transformed_data) self.assertTrue("cat_categorical" in transformed_data) @@ -48,7 +48,7 @@ def test_fit_transform(self): def test_ple(self): """Test fitting and transforming the data.""" pp = Preprocessor(numerical_preprocessing="ple", n_bins=20) - transformed_data = pp.fit_transform(self.data) + transformed_data = pp.fit_transform(self.data, self.target) self.assertIsInstance(transformed_data, dict) self.assertTrue("num_numerical" in transformed_data) self.assertTrue("cat_categorical" in transformed_data) @@ -59,7 +59,7 @@ def test_transform_with_missing_values(self): data_with_missing.loc[0, "numerical"] = np.nan data_with_missing.loc[1, "categorical"] = np.nan pp = Preprocessor(numerical_preprocessing="normalization") - transformed_data = pp.fit_transform(data_with_missing) + transformed_data = pp.fit_transform(data_with_missing, self.target) self.assertNotIn(np.nan, transformed_data["num_numerical"]) self.assertNotIn(np.nan, transformed_data["cat_categorical"]) diff --git a/tests/test_regressor.py b/tests/test_regressor.py index ab405b3..8daff5d 100644 --- a/tests/test_regressor.py +++ b/tests/test_regressor.py @@ -34,9 +34,9 @@ def tearDown(self): def test_initialization(self): # This assumes MambularConfig is properly imported and used in the MambularRegressor class - from mambular.utils.config import MambularConfig + from mambular.utils.configs import DefaultMambularConfig - self.assertIsInstance(self.regressor.config, MambularConfig) + self.assertIsInstance(self.regressor.config, DefaultMambularConfig) self.assertEqual(self.regressor.config.d_model, 128) self.assertEqual(self.regressor.config.dropout, 0.1) @@ -65,8 +65,7 @@ def test_predict(self): # Create mock return objects that mimic tensor behavior 
mock_prediction = MagicMock() mock_prediction.cpu.return_value = MagicMock() - mock_prediction.cpu.return_value.numpy.return_value = np.array([ - 0.5] * 100) + mock_prediction.cpu.return_value.numpy.return_value = np.array([0.5] * 100) # Mock the model and its method calls self.regressor.model = MagicMock() @@ -87,8 +86,7 @@ def test_evaluate(self): self.regressor.predict = MagicMock(return_value=mock_predictions) # Define metrics to test - metrics = {"Mean Squared Error": mean_squared_error, - "R2 Score": r2_score} + metrics = {"Mean Squared Error": mean_squared_error, "R2 Score": r2_score} # Call evaluate with the defined metrics result = self.regressor.evaluate(self.X, self.y, metrics=metrics) From bbf07c93519e3ef087690910834249ee183c1d6c Mon Sep 17 00:00:00 2001 From: thielmaf Date: Wed, 29 May 2024 15:51:35 +0000 Subject: [PATCH 21/21] delete old config --- mambular/utils/config.py | 150 --------------------------------------- 1 file changed, 150 deletions(-) delete mode 100644 mambular/utils/config.py diff --git a/mambular/utils/config.py b/mambular/utils/config.py deleted file mode 100644 index 3757d5b..0000000 --- a/mambular/utils/config.py +++ /dev/null @@ -1,150 +0,0 @@ -from dataclasses import dataclass, asdict, field -import json -import os -import math -from typing import Union, Type, List -from .normalization_layers import ( - RMSNorm, - LayerNorm, - LearnableLayerScaling, - BatchNorm, - InstanceNorm, - GroupNorm, -) -import torch.nn as nn - - -@dataclass -class MambularConfig: - """ - A configuration class specific to the Mambular model. - Handles Mamba-specific hyperparameters as well as vocabulary size and output dimensions. - - Attributes: - d_model (int): The dimensionality of the input and output tensors. - n_layers (int): The number of MambaBlocks in the model. - dt_rank (Union[int, str]): The rank of the dynamical time tensor. - Can be an integer or 'auto' to calculate automatically based on d_model. - d_state (int): The dimensionality of the state tensor. - expand_factor (int): The factor by which the inner dimensionality is expanded. - d_conv (int): The dimensionality of the convolutional layer. - - dt_min (float): The minimum value for dynamical time. - dt_max (float): The maximum value for dynamical time. - dt_init (str): The initialization method for dynamical time. Either 'constant' or 'random'. - dt_scale (float): The scale factor for dynamical time initialization. - dt_init_floor (float): The floor value for dynamical time initialization. - - dropout (float): The dropout probability. - bias (bool): Whether to include bias in linear layers. - weight_decay (float): weight decay in optimizer. - conv_bias (bool): Whether to include bias in the convolutional layer. - vocab_size (list): The sizes of the vocabulary for the features used by the Mambular model. - output_dimension (int): The dimensionality of the output layer. - pooling_method (str): The pooling method for combining token embeddings. - Options: 'avg', 'max', 'sum', 'cls_token'. - norm (nn.Module): The normalization layer to use. - Options: RMSNorm, LayerNorm, LearnableLayerScaling, BatchNorm, InstanceNorm, GroupNorm. - - Methods: - __post_init__(): Performs additional initialization steps or checks after instance creation. - save_pretrained(save_directory: str): Saves the configuration to a JSON file. - - Raises: - ValueError: If invalid values are provided for pooling method or normalization layer. 
- """ - - VALID_POOLING_METHODS = ["avg", "max", "sum", "cls_token"] - - VALID_NORMALIZATION_LAYERS = { - "RMSNorm": RMSNorm, - "LayerNorm": LayerNorm, - "LearnableLayerScaling": LearnableLayerScaling, - "BatchNorm": BatchNorm, - "InstanceNorm": InstanceNorm, - "GroupNorm": GroupNorm, - } - - d_model: int = 64 - n_layers: int = 6 - dt_rank: Union[int, str] = "auto" - d_state: int = 32 - expand_factor: int = 2 - d_conv: int = 8 - - dt_min: float = 0.001 - dt_max: float = 0.1 - dt_init: str = "random" - dt_scale: float = 1.0 - dt_init_floor: float = 1e-4 - dropout: float = 0.05 - - bias: bool = False - weight_decay: float = 0.025 - conv_bias: bool = True - output_dimension: int = 1 - pooling_method: str = "avg" - norm: Union[str, Type[nn.Module]] = RMSNorm - num_embedding_activation: str = "linear" - tabular_head_units: list = field(default_factory=lambda: [128, 64, 64]) - tabular_head_activation: str = "relu" - tabular_head_dropout: float = 0.3 - layer_norm_after_embedding: bool = True - - def __post_init__(self): - """ - Called automatically after the initialization of MambularConfig instances. - Performs additional initialization steps or checks, if required. - """ - self.d_inner = self.expand_factor * self.d_model - - if self.dt_rank == "auto": - self.dt_rank = math.ceil(self.d_model / 16) - - # Check if the provided pooling method is valid - if self.pooling_method not in self.VALID_POOLING_METHODS: - raise ValueError( - f"Invalid pooling method: {self.pooling_method}. " - f"Valid options are: {', '.join(self.VALID_POOLING_METHODS)}" - ) - - # Check if the provided normalization layer is valid - if ( - isinstance(self.norm, type) - and self.norm.__name__ not in self.VALID_NORMALIZATION_LAYERS - ): - raise ValueError( - f"Invalid normalization layer: {self.norm.__name__}. " - f"Valid options are: {', '.join(self.VALID_NORMALIZATION_LAYERS.keys())}" - ) - elif ( - isinstance(self.norm, str) - and self.norm not in self.VALID_NORMALIZATION_LAYERS - ): - raise ValueError( - f"Invalid normalization layer: {self.norm}. " - f"Valid options are: {', '.join(self.VALID_NORMALIZATION_LAYERS.keys())}" - ) - - def save_pretrained(self, save_directory: str): - """ - Saves the configuration parameters of the MambularConfig instance to a JSON file - in the specified directory. This is useful for model persistence, reproducibility, - or reloading the model configuration in the future. - - Parameters: - save_directory (str): The directory path where the configuration JSON file will be saved. - - Returns: - None: The method prints the path where the configuration is saved but does not return any value. - """ - os.makedirs(save_directory, exist_ok=True) - # Define the configuration file path - config_file = os.path.join(save_directory, "config.json") - - # Convert the dataclass to a dictionary and then to a JSON string - config_dict = asdict(self) - with open(config_file, "w") as f: - json.dump(config_dict, f, indent=4) - - print(f"Configuration saved in {config_file}")