diff --git a/examples/benchmark_lss.py b/examples/benchmark_lss.py
deleted file mode 100644
index c84362e..0000000
--- a/examples/benchmark_lss.py
+++ /dev/null
@@ -1,70 +0,0 @@
-from mambular.models import MambularLSS
-from sklearn.preprocessing import StandardScaler
-import pandas as pd
-import numpy as np
-import properscoring as ps
-from sklearn.model_selection import KFold
-
-datasets = ["regression_datasets/abalone.csv", "regression_datasets/ca_housing.csv"]
-
-crps = lambda y, pred: np.mean(
-    [
-        ps.crps_gaussian(y[i], mu=pred[i, 0], sig=np.sqrt(pred[i, 1]))
-        for i in range(len(y))
-    ]
-)
-
-kf = KFold(n_splits=5, shuffle=True, random_state=42)
-
-
-# Function to compute NLL
-def compute_nll(y, pred):
-    means = pred[:, 0]
-    variances = pred[:, 1]
-    nll = 0.5 * (np.log(2 * np.pi * variances) + ((y - means) ** 2) / variances)
-    return np.mean(nll)
-
-
-results = []
-
-for dataset_name in datasets:
-    data = pd.read_csv(dataset_name)
-    data = data.dropna().reset_index(drop=True)
-    y_data = data.pop("Targets")
-    scaler = StandardScaler()
-    y_data = scaler.fit_transform(y_data.values.reshape(-1, 1)).squeeze(-1)
-
-    crps_vals = []
-    nll_vals = []
-    mse_vals = []
-
-    for fold, (train_index, val_index) in enumerate(kf.split(data)):
-        X_train, X_test = data.iloc[train_index], data.iloc[val_index]
-        y_train, y_test = y_data[train_index], y_data[val_index]
-
-        model = MambularLSS()
-
-        model.fit(X_train, y_train, family="normal", max_epochs=200, lr=5e-04)
-
-        print(model.evaluate(X_test, y_test))
-
-        predictions = model.predict(X_test)
-
-        crps_vals.append(crps(y_test, predictions))
-        nll_vals.append(compute_nll(y_test, predictions))
-        mse_vals.append(model.evaluate(X_test, y_test)["MSE"])
-
-    results.append(
-        {
-            "Dataset": dataset_name,
-            "Mean CRPS": np.mean(crps_vals),
-            "Std CRPS": np.std(crps_vals),
-            "Mean NLL": np.mean(nll_vals),
-            "Std NLL": np.std(nll_vals),
-            "MSE": np.mean(mse_vals),
-        }
-    )
-
-results_df = pd.DataFrame(results)
-
-print(results_df)
diff --git a/examples/benchmark_mambularlss.py b/examples/benchmark_mambularlss.py
new file mode 100644
index 0000000..bdab406
--- /dev/null
+++ b/examples/benchmark_mambularlss.py
@@ -0,0 +1,82 @@
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+from loguru import logger
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import StandardScaler
+from utils import compute_nll, crps, timeit
+
+from mambular.models import MambularLSS
+
+time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+logger.add(f"mambular_lss_{time}.log", backtrace=True, diagnose=True)
+
+
+# Datasets
+datasets = ["regression_datasets/abalone.csv",
+            "regression_datasets/ca_housing.csv"]
+
+SEED = 42
+
+
+@timeit
+def main():
+    results = []
+    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
+
+    for dataset_path in datasets:
+        if "abalone" in dataset_path:
+            dataset = "abalone"
+        elif "ca_housing" in dataset_path:
+            dataset = "ca_housing"
+        else:
+            dataset = "unknown"
+        logger.info(f"Processing {dataset}")
+        data = pd.read_csv(dataset_path)
+        data = data.dropna().reset_index(drop=True)
+        y_data = data.pop("Targets")
+        scaler = StandardScaler()
+        y_data = scaler.fit_transform(y_data.values.reshape(-1, 1)).squeeze(-1)
+        logger.info(f"Data shape: {data.shape}")
+        logger.info(f"y_data shape: {y_data.shape}")
+
+        crps_vals = []
+        nll_vals = []
+        mse_vals = []
+
+        for fold, (train_index, val_index) in enumerate(kf.split(data)):
+            logger.info(f"Started fold: {fold}")
+            X_train, X_test = data.iloc[train_index], data.iloc[val_index]
+            y_train, y_test = y_data[train_index], y_data[val_index]
+
+            model = MambularLSS()
+            model.fit(X_train, y_train, family="normal",
+                      max_epochs=100, lr=5e-04)
+
+            # print(model.evaluate(X_test, y_test))
+            predictions = model.predict(X_test)
+
+            crps_vals.append(crps(y_test, predictions))
+            nll_vals.append(compute_nll(y_test, predictions))
+            mse_vals.append(model.evaluate(X_test, y_test)["MSE"])
+            logger.info(f"Completed fold: {fold}")
+
+        results.append(
+            {
+                "Dataset": dataset,
+                "Mean CRPS": np.mean(crps_vals),
+                "Std CRPS": np.std(crps_vals),
+                "Mean NLL": np.mean(nll_vals),
+                "Std NLL": np.std(nll_vals),
+                "MSE": np.mean(mse_vals),
+            }
+        )
+    logger.info("Finished all datasets")
+    logger.info(results)
+    results_df = pd.DataFrame(results)
+    results_df.to_csv("result_mambularlss.csv", index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/benchmark_xgboostlss.py b/examples/benchmark_xgboostlss.py
new file mode 100644
index 0000000..d330203
--- /dev/null
+++ b/examples/benchmark_xgboostlss.py
@@ -0,0 +1,122 @@
+import multiprocessing as mp
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from loguru import logger
+from sklearn.metrics import mean_squared_error as mse
+from sklearn.model_selection import KFold
+from sklearn.preprocessing import OrdinalEncoder, StandardScaler
+from utils import compute_nll, crps, timeit
+from xgboostlss.distributions.Gaussian import Gaussian
+from xgboostlss.model import XGBoostLSS
+
+time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+logger.add(f"xgboost_lss_{time}.log", backtrace=True, diagnose=True)
+
+
+# Datasets
+datasets = ["regression_datasets/abalone.csv",
+            "regression_datasets/ca_housing.csv"]
+
+SEED = 42
+N_CPU = mp.cpu_count()
+
+
+@timeit
+def main():
+    results = []
+    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
+
+    for dataset_path in datasets:
+        if "abalone" in dataset_path:
+            dataset = "abalone"
+        elif "ca_housing" in dataset_path:
+            dataset = "ca_housing"
+        else:
+            dataset = "unknown"
+
+        logger.info(f"Processing {dataset}")
+        data = pd.read_csv(dataset_path)
+        data = data.dropna().reset_index(drop=True)
+        y_data = data.pop("Targets")
+
+        scaler = StandardScaler()
+        y_data = scaler.fit_transform(y_data.values.reshape(-1, 1)).squeeze(-1)
+        logger.info(f"Data shape: {data.shape}")
+        logger.info(f"y_data shape: {y_data.shape}")
+
+        crps_vals = []
+        nll_vals = []
+        mse_vals = []
+
+        for fold, (train_index, val_index) in enumerate(kf.split(data)):
+            logger.info(f"Started fold: {fold}")
+            X_train, X_test = data.iloc[train_index], data.iloc[val_index]
+            y_train, y_test = y_data[train_index], y_data[val_index]
+            encoder = OrdinalEncoder()
+            if dataset == 'abalone':
+                # Encode the categorical 'Sex' column of 'abalone'
+                X_train.loc[:, "Sex"] = encoder.fit_transform(X_train[["Sex"]])
+                X_test.loc[:, "Sex"] = encoder.transform(X_test[["Sex"]])
+            elif dataset == 'ca_housing':
+                # Encode the categorical 'ocean_proximity' column of 'ca_housing'
+                X_train.loc[:, "ocean_proximity"] = encoder.fit_transform(
+                    X_train[["ocean_proximity"]])
+                X_test.loc[:, "ocean_proximity"] = encoder.transform(
+                    X_test[["ocean_proximity"]])
+            else:
+                pass
+
+            dtrain = xgb.DMatrix(X_train.values, label=y_train,
+                                 nthread=N_CPU, enable_categorical=False)
+            dtest = xgb.DMatrix(X_test.values, nthread=N_CPU,
+                                enable_categorical=False)
+            model = XGBoostLSS(Gaussian(stabilization="None",
+                                        response_fn="exp",
+                                        loss_fn="nll"
+                                        ))
+
+            # Default XGBoost parameters; XGBoostLSS uses the same
+            # defaults as plain XGBoost
+            default_param = {
+                "eta": 0.3,
+                "max_depth": 6,
+                "gamma": 0,
+                "subsample": 1,
+                "colsample_bytree": 1,
+                "min_child_weight": 1,
+                "booster": "gbtree",
+            }
+            # Train the model with the default hyperparameters
+            model.train(default_param,
+                        dtrain,
+                        num_boost_round=100
+                        )
+
+            # Predicted distributional parameters, i.e. mean and variance
+            predictions = model.predict(dtest, pred_type="parameters")
+
+            crps_vals.append(crps(y_test, predictions.values))
+            nll_vals.append(compute_nll(y_test, predictions.values))
+            mse_vals.append(mse(y_test, predictions.values[:, 0]))
+
+        results.append(
+            {
+                "Dataset": dataset,
+                "Mean CRPS": np.mean(crps_vals),
+                "Std CRPS": np.std(crps_vals),
+                "Mean NLL": np.mean(nll_vals),
+                "Std NLL": np.std(nll_vals),
+                "MSE": np.mean(mse_vals),
+            }
+        )
+    logger.info("Finished all datasets")
+    logger.info(results)
+    results_df = pd.DataFrame(results)
+    results_df.to_csv("result_xgboostlss.csv", index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/utils.py b/examples/utils.py
new file mode 100644
index 0000000..b0769ed
--- /dev/null
+++ b/examples/utils.py
@@ -0,0 +1,32 @@
+from datetime import datetime
+
+import numpy as np
+import properscoring as ps
+from loguru import logger
+
+
+def crps(y, pred):
+    result = np.mean([ps.crps_gaussian(y[i], mu=pred[i, 0],
+                      sig=np.sqrt(pred[i, 1])) for i in range(len(y))])
+    return result
+
+
+# Gaussian NLL; pred[:, 0] holds the means, pred[:, 1] the variances
+def compute_nll(y, pred):
+    means = pred[:, 0]
+    variances = pred[:, 1]
+    nll = 0.5 * (np.log(2 * np.pi * variances) +
+                 ((y - means) ** 2) / variances)
+    return np.mean(nll)
+
+
+def timeit(method):
+    def timed(*args, **kw):
+        ts = datetime.now()
+        logger.info(f"Started {method.__name__}")
+        result = method(*args, **kw)
+        te = datetime.now()
+        logger.info(f"Finished {method.__name__}")
+        logger.info(f"{method.__name__} took: {(te - ts).total_seconds() / 60} minutes")
+        return result
+    return timed
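
As a quick sanity check of the shared helpers that both benchmarks import from examples/utils.py, the sketch below (not part of the patch) feeds a perfectly calibrated synthetic Gaussian forecast through crps and compute_nll. It assumes it is run from examples/ so utils is importable, and that, as in both scripts, pred[:, 0] holds the predicted mean and pred[:, 1] the predicted variance; the values in the comments are analytic expectations for N(0, 1), not benchmark results.

import numpy as np

from utils import compute_nll, crps

rng = np.random.default_rng(42)
n = 10_000

# Targets drawn from a standard normal distribution
y = rng.normal(size=n)

# A perfectly calibrated forecast: mean 0, variance 1 for every point,
# in the (mean, variance) column layout both helpers expect
pred = np.column_stack([np.zeros(n), np.ones(n)])

# Expected CRPS of an N(0, 1) forecast against N(0, 1) targets is
# (sqrt(2) - 1) / sqrt(pi) ~= 0.234
print(f"CRPS: {crps(y, pred):.3f}")

# Expected Gaussian NLL is 0.5 * (log(2 * pi) + 1) ~= 1.419
print(f"NLL:  {compute_nll(y, pred):.3f}")

Scores far from these constants for calibrated inputs would point to a unit mismatch, e.g. a model emitting a standard deviation where these helpers expect a variance.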