Commit

lss benchmark

mkumar73 committed Jul 25, 2024
1 parent 29244c8 commit 7f11cc9
Showing 4 changed files with 236 additions and 70 deletions.
70 changes: 0 additions & 70 deletions examples/benchmark_lss.py

This file was deleted.

82 changes: 82 additions & 0 deletions examples/benchmark_mambularlss.py
@@ -0,0 +1,82 @@
from datetime import datetime

import numpy as np
import pandas as pd
from loguru import logger
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from utils import compute_nll, crps, timeit

from mambular.models import MambularLSS

time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logger.add(f"mambular_lss_{time}.log", backtrace=True, diagnose=True)


# Datasets
datasets = ["regression_datasets/abalone.csv",
"regression_datasets/ca_housing.csv"]

SEED = 42


@timeit
def main():
    results = []
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

    for dataset_path in datasets:
        if "abalone" in dataset_path:
            dataset = "abalone"
        elif "ca_housing" in dataset_path:
            dataset = "ca_housing"
        else:
            dataset = "unknown"
        logger.info(f"Processing {dataset}")

        data = pd.read_csv(dataset_path)
        data = data.dropna().reset_index(drop=True)
        y_data = data.pop("Targets")
        scaler = StandardScaler()
        y_data = scaler.fit_transform(y_data.values.reshape(-1, 1)).squeeze(-1)
        logger.info(f"Data shape: {data.shape}")
        logger.info(f"y_data shape: {y_data.shape}")

        crps_vals = []
        nll_vals = []
        mse_vals = []

        for fold, (train_index, val_index) in enumerate(kf.split(data)):
            logger.info(f"Started fold: {fold}")
            X_train, X_test = data.iloc[train_index], data.iloc[val_index]
            y_train, y_test = y_data[train_index], y_data[val_index]

            model = MambularLSS()
            model.fit(X_train, y_train, family="normal",
                      max_epochs=100, lr=5e-04)

            predictions = model.predict(X_test)

            crps_vals.append(crps(y_test, predictions))
            nll_vals.append(compute_nll(y_test, predictions))
            mse_vals.append(model.evaluate(X_test, y_test)["MSE"])
            logger.info(f"Completed fold: {fold}")

        results.append(
            {
                "Dataset": dataset,
                "Mean CRPS": np.mean(crps_vals),
                "Std CRPS": np.std(crps_vals),
                "Mean NLL": np.mean(nll_vals),
                "Std NLL": np.std(nll_vals),
                "MSE": np.mean(mse_vals),
            }
        )
    logger.info("Finished all datasets")
    logger.info(results)
    results_df = pd.DataFrame(results)
    results_df.to_csv("result_mambularlss.csv", index=False)


if __name__ == "__main__":
    main()
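
The shared metrics in utils.py receive the raw prediction array straight from model.predict, so the scripts implicitly assume a (n_samples, 2) layout: the predicted mean in column 0 and the predicted variance in column 1 (crps takes the square root to recover the standard deviation). A throwaway sanity check of that contract, with synthetic data and utils.py on the import path:

import numpy as np

from utils import compute_nll, crps

rng = np.random.default_rng(0)

# Fake targets and a fake (mean, variance) prediction array standing in
# for model.predict(X_test).
y = rng.normal(size=100)
pred = np.column_stack([y + rng.normal(scale=0.1, size=100),  # means
                        np.full(100, 0.01)])                  # variances

print(crps(y, pred))        # small, since the means sit near the targets
print(compute_nll(y, pred))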
122 changes: 122 additions & 0 deletions examples/benchmark_xgboostlss.py
@@ -0,0 +1,122 @@
import multiprocessing as mp
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from loguru import logger
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from utils import compute_nll, crps, timeit
from xgboostlss.distributions.Gaussian import Gaussian
from xgboostlss.model import XGBoostLSS

time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logger.add(f"xgboost_lss_{time}.log", backtrace=True, diagnose=True)


# Datasets
datasets = ["regression_datasets/abalone.csv",
            "regression_datasets/ca_housing.csv"]

SEED = 42
N_CPU = mp.cpu_count()


@timeit
def main():
    results = []
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

    for dataset_path in datasets:
        if "abalone" in dataset_path:
            dataset = "abalone"
        elif "ca_housing" in dataset_path:
            dataset = "ca_housing"
        else:
            dataset = "unknown"

        logger.info(f"Processing {dataset}")
        data = pd.read_csv(dataset_path)
        data = data.dropna().reset_index(drop=True)
        y_data = data.pop("Targets")

        scaler = StandardScaler()
        y_data = scaler.fit_transform(y_data.values.reshape(-1, 1)).squeeze(-1)
        logger.info(f"Data shape: {data.shape}")
        logger.info(f"y_data shape: {y_data.shape}")

        crps_vals = []
        nll_vals = []
        mse_vals = []

        for fold, (train_index, val_index) in enumerate(kf.split(data)):
            logger.info(f"Started fold: {fold}")
            X_train, X_test = data.iloc[train_index], data.iloc[val_index]
            y_train, y_test = y_data[train_index], y_data[val_index]

            # Encode the single categorical column of each dataset
            encoder = OrdinalEncoder()
            if dataset == "abalone":
                X_train.loc[:, "Sex"] = encoder.fit_transform(X_train[["Sex"]])
                X_test.loc[:, "Sex"] = encoder.transform(X_test[["Sex"]])
            elif dataset == "ca_housing":
                X_train.loc[:, "ocean_proximity"] = encoder.fit_transform(
                    X_train[["ocean_proximity"]])
                X_test.loc[:, "ocean_proximity"] = encoder.transform(
                    X_test[["ocean_proximity"]])

            dtrain = xgb.DMatrix(X_train.values, label=y_train,
                                 nthread=N_CPU, enable_categorical=False)
            dtest = xgb.DMatrix(X_test.values, nthread=N_CPU,
                                enable_categorical=False)
            model = XGBoostLSS(Gaussian(stabilization="None",
                                        response_fn="exp",
                                        loss_fn="nll"))

            # Default XGBoost parameters; XGBoostLSS uses the same
            # defaults as plain XGBoost
            default_param = {
                "eta": 0.3,
                "max_depth": 6,
                "gamma": 0,
                "subsample": 1,
                "colsample_bytree": 1,
                "min_child_weight": 1,
                "booster": "gbtree",
            }
            # Train the model with the default hyperparameters
            model.train(default_param,
                        dtrain,
                        num_boost_round=100)

            # Predicted distributional parameters, i.e. mean and variance
            predictions = model.predict(dtest, pred_type="parameters")

            crps_vals.append(crps(y_test, predictions.values))
            nll_vals.append(compute_nll(y_test, predictions.values))
            mse_vals.append(mse(y_test, predictions.values[:, 0]))

        results.append(
            {
                "Dataset": dataset,
                "Mean CRPS": np.mean(crps_vals),
                "Std CRPS": np.std(crps_vals),
                "Mean NLL": np.mean(nll_vals),
                "Std NLL": np.std(nll_vals),
                "MSE": np.mean(mse_vals),
            }
        )
    logger.info("Finished all datasets")
    logger.info(results)
    results_df = pd.DataFrame(results)
    results_df.to_csv("result_xgboostlss.csv", index=False)


if __name__ == "__main__":
    main()
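
One caveat: both scripts standardize the targets before training, so every reported CRPS, NLL, and MSE is in standardized units. If numbers in the original target units were wanted, the fitted scaler could be inverted; a hypothetical helper along those lines (not part of this commit), again assuming the (mean, variance) column layout:

import numpy as np
from sklearn.preprocessing import StandardScaler


def to_original_units(scaler: StandardScaler, y_std, mu_std, var_std):
    # Undoing y_std = (y - mean_) / scale_: the Gaussian mean transforms
    # affinely, the variance scales by scale_ squared.
    s, m = scaler.scale_[0], scaler.mean_[0]
    return y_std * s + m, mu_std * s + m, var_std * s**2

Inside the fold loop this would be called as to_original_units(scaler, y_test, predictions.values[:, 0], predictions.values[:, 1]) before computing the metrics.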
32 changes: 32 additions & 0 deletions examples/utils.py
@@ -0,0 +1,32 @@
from datetime import datetime

import numpy as np
import properscoring as ps
from loguru import logger


def crps(y, pred):
    # pred columns: [mean, variance]; crps_gaussian expects the std dev
    result = np.mean([ps.crps_gaussian(y[i], mu=pred[i, 0],
                                       sig=np.sqrt(pred[i, 1]))
                      for i in range(len(y))])
    return result


# Function to compute the Gaussian negative log-likelihood (NLL)
def compute_nll(y, pred):
    means = pred[:, 0]
    variances = pred[:, 1]
    nll = 0.5 * (np.log(2 * np.pi * variances) +
                 ((y - means) ** 2) / variances)
    return np.mean(nll)


def timeit(method):
    def timed(*args, **kw):
        ts = datetime.now()
        logger.info(f"Started {method.__name__}")
        result = method(*args, **kw)
        te = datetime.now()
        logger.info(f"Finished {method.__name__}")
        # (te - ts) / 60 would log a timedelta object, not minutes
        logger.info(f"{method.__name__} took: "
                    f"{(te - ts).total_seconds() / 60:.2f} minutes")
        return result

    return timed
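
Since compute_nll is exactly the negative Gaussian log-density averaged over samples, it can be cross-checked against scipy.stats.norm; a quick sketch with synthetic data (scipy comes in via scikit-learn's dependencies):

import numpy as np
from scipy.stats import norm

from utils import compute_nll

rng = np.random.default_rng(1)
y = rng.normal(size=50)
pred = np.column_stack([rng.normal(size=50),           # means
                        rng.uniform(0.5, 2.0, 50)])    # variances

# -logpdf = 0.5*log(2*pi*var) + (y - mean)**2 / (2*var), matching compute_nll
expected = -norm.logpdf(y, loc=pred[:, 0], scale=np.sqrt(pred[:, 1])).mean()
assert np.isclose(compute_nll(y, pred), expected)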
