-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
236 additions
and
70 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from datetime import datetime | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from loguru import logger | ||
from sklearn.model_selection import KFold | ||
from sklearn.preprocessing import StandardScaler | ||
from utils import compute_nll, crps, timeit | ||
|
||
from mambular.models import MambularLSS | ||
|
||
# Timestamped log sink for this run. Renamed from `time` to avoid shadowing
# the stdlib `time` module name (the value is only used on the next line).
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logger.add(f"mambular_lss_{run_stamp}.log", backtrace=True, diagnose=True)


# Datasets to benchmark (CSV files expected to contain a "Targets" column).
datasets = ["regression_datasets/abalone.csv",
            "regression_datasets/ca_housing.csv"]

# Shared RNG seed so the KFold split is reproducible across runs.
SEED = 42
|
||
|
||
@timeit
def main():
    """Run 5-fold CV of MambularLSS on each regression dataset and save metrics.

    For every dataset: load the CSV, drop NA rows, standardize the target,
    then train a fresh ``MambularLSS`` per fold and collect CRPS / NLL / MSE.
    A summary row per dataset is written to ``result_mambularlss.csv``.
    """
    from pathlib import Path

    results = []
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

    for dataset_path in datasets:
        # Dataset name from the file stem (generalizes the old abalone /
        # ca_housing if/elif chain, which fell back to "unknown" otherwise).
        dataset = Path(dataset_path).stem
        logger.info(f"Processing {dataset}")
        data = pd.read_csv(dataset_path)
        data = data.dropna().reset_index(drop=True)
        y_data = data.pop("Targets")

        # NOTE(review): the target scaler is fit on the FULL dataset before the
        # CV split, which leaks test-fold statistics into training folds —
        # confirm this matches the intended benchmark protocol.
        scaler = StandardScaler()
        y_data = scaler.fit_transform(y_data.values.reshape(-1, 1)).squeeze(-1)
        logger.info(f"Data shape: {data.shape}")
        logger.info(f"y_data shape: {y_data.shape}")

        crps_vals = []
        nll_vals = []
        mse_vals = []

        for fold, (train_index, val_index) in enumerate(kf.split(data)):
            logger.info(f"Started fold: {fold}")
            X_train, X_test = data.iloc[train_index], data.iloc[val_index]
            y_train, y_test = y_data[train_index], y_data[val_index]

            # Fresh model per fold so no state carries across folds.
            model = MambularLSS()
            model.fit(X_train, y_train, family="normal",
                      max_epochs=100, lr=5e-04)

            # Predicted distributional parameters per row
            # (assumed column 0 = mean, column 1 = variance — as consumed
            # by crps()/compute_nll(); confirm against MambularLSS docs).
            predictions = model.predict(X_test)

            crps_vals.append(crps(y_test, predictions))
            nll_vals.append(compute_nll(y_test, predictions))
            mse_vals.append(model.evaluate(X_test, y_test)["MSE"])
            logger.info(f"Completed fold: {fold}")

        results.append(
            {
                "Dataset": dataset,
                "Mean CRPS": np.mean(crps_vals),
                "Std CRPS": np.std(crps_vals),
                "Mean NLL": np.mean(nll_vals),
                "Std NLL": np.std(nll_vals),
                "MSE": np.mean(mse_vals),
            }
        )
    logger.info("Finished all datasets")
    logger.info(results)
    results_df = pd.DataFrame(results)
    # Plain string literal — no placeholders, so the f-prefix was removed.
    results_df.to_csv("result_mambularlss.csv", index=False)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
import multiprocessing as mp | ||
from datetime import datetime | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import xgboost as xgb | ||
from loguru import logger | ||
from sklearn.metrics import mean_squared_error as mse | ||
from sklearn.model_selection import KFold | ||
from sklearn.preprocessing import OrdinalEncoder, StandardScaler | ||
from utils import compute_nll, crps, timeit | ||
from xgboostlss.distributions.Gaussian import Gaussian | ||
from xgboostlss.model import XGBoostLSS | ||
|
||
# Timestamped log sink for this run. Renamed from `time` to avoid shadowing
# the stdlib `time` module name (the value is only used on the next line).
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
logger.add(f"xgboost_lss_{run_stamp}.log", backtrace=True, diagnose=True)


# Datasets to benchmark (CSV files expected to contain a "Targets" column).
datasets = ["regression_datasets/abalone.csv",
            "regression_datasets/ca_housing.csv"]

# Shared RNG seed so the KFold split is reproducible across runs.
SEED = 42
# Use every available core when building the xgboost DMatrix objects.
N_CPU = mp.cpu_count()
|
||
|
||
@timeit
def main():
    """Run 5-fold CV of XGBoostLSS (Gaussian head) on each dataset and save metrics.

    For every dataset: load the CSV, drop NA rows, standardize the target,
    then per fold ordinal-encode the categorical column, train XGBoostLSS with
    default XGBoost hyperparameters, and collect CRPS / NLL / MSE. A summary
    row per dataset is written to ``result_xgboostlss.csv``.
    """
    from pathlib import Path

    results = []
    kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

    for dataset_path in datasets:
        # Dataset name from the file stem (generalizes the old abalone /
        # ca_housing if/elif chain, which fell back to "unknown" otherwise).
        dataset = Path(dataset_path).stem

        logger.info(f"Processing {dataset}")
        data = pd.read_csv(dataset_path)
        data = data.dropna().reset_index(drop=True)
        y_data = data.pop("Targets")

        # NOTE(review): the target scaler is fit on the FULL dataset before the
        # CV split, which leaks test-fold statistics into training folds —
        # confirm this matches the intended benchmark protocol.
        scaler = StandardScaler()
        y_data = scaler.fit_transform(y_data.values.reshape(-1, 1)).squeeze(-1)
        logger.info(f"Data shape: {data.shape}")
        logger.info(f"y_data shape: {y_data.shape}")

        crps_vals = []
        nll_vals = []
        mse_vals = []

        for fold, (train_index, val_index) in enumerate(kf.split(data)):
            logger.info(f"Started fold: {fold}")
            X_train, X_test = data.iloc[train_index], data.iloc[val_index]
            y_train, y_test = y_data[train_index], y_data[val_index]

            # Encoder is fit on the training fold only, then applied to the
            # test fold (categories unseen in training would raise here).
            encoder = OrdinalEncoder()
            if dataset == 'abalone':
                X_train.loc[:, "Sex"] = encoder.fit_transform(X_train[["Sex"]])
                X_test.loc[:, "Sex"] = encoder.transform(X_test[["Sex"]])
            elif dataset == 'ca_housing':
                X_train.loc[:, "ocean_proximity"] = encoder.fit_transform(
                    X_train[["ocean_proximity"]])
                X_test.loc[:, "ocean_proximity"] = encoder.transform(
                    X_test[["ocean_proximity"]])

            dtrain = xgb.DMatrix(X_train.values, label=y_train,
                                 nthread=N_CPU, enable_categorical=False)
            dtest = xgb.DMatrix(X_test.values, nthread=N_CPU,
                                enable_categorical=False)
            model = XGBoostLSS(Gaussian(stabilization="None",
                                        response_fn="exp",
                                        loss_fn="nll"))

            # Default XGBoost hyperparameters — XGBoostLSS shares the same
            # defaults, listed explicitly for clarity.
            default_param = {
                "eta": 0.3,
                "max_depth": 6,
                "gamma": 0,
                "subsample": 1,
                "colsample_bytree": 1,
                "min_child_weight": 1,
                "booster": "gbtree",
            }
            model.train(default_param,
                        dtrain,
                        num_boost_round=100)

            # Predicted distributional parameters: column 0 = mean,
            # column 1 = variance (as consumed by crps()/compute_nll()).
            predictions = model.predict(dtest, pred_type="parameters")

            crps_vals.append(crps(y_test, predictions.values))
            nll_vals.append(compute_nll(y_test, predictions.values))
            mse_vals.append(mse(y_test, predictions.values[:, 0]))
            logger.info(f"Completed fold: {fold}")

        results.append(
            {
                "Dataset": dataset,
                "Mean CRPS": np.mean(crps_vals),
                "Std CRPS": np.std(crps_vals),
                "Mean NLL": np.mean(nll_vals),
                "Std NLL": np.std(nll_vals),
                "MSE": np.mean(mse_vals),
            }
        )
    logger.info("Finished all datasets")
    logger.info(results)
    results_df = pd.DataFrame(results)
    # Plain string literal — no placeholders, so the f-prefix was removed.
    results_df.to_csv("result_xgboostlss.csv", index=False)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from datetime import datetime | ||
|
||
import numpy as np | ||
import properscoring as ps | ||
from loguru import logger | ||
|
||
|
||
def crps(y, pred):
    """Return the mean CRPS of Gaussian predictive distributions vs. observations.

    Parameters
    ----------
    y : array-like, shape (n,)
        Observed target values.
    pred : ndarray, shape (n, 2)
        Column 0 is the predictive mean, column 1 the predictive *variance*
        (the previous implementation passed ``sqrt(pred[:, 1])`` as sigma).

    Returns
    -------
    float
        Average CRPS over all samples (lower is better).
    """
    from scipy.stats import norm

    mu = pred[:, 0]
    sigma = np.sqrt(pred[:, 1])
    z = (np.asarray(y) - mu) / sigma
    # Closed-form CRPS for a Gaussian forecast (Gneiting & Raftery, 2007) —
    # the same formula properscoring's crps_gaussian evaluates — vectorized
    # instead of the previous per-sample Python loop.
    per_sample = sigma * (z * (2.0 * norm.cdf(z) - 1.0)
                          + 2.0 * norm.pdf(z) - 1.0 / np.sqrt(np.pi))
    return float(np.mean(per_sample))
|
||
|
||
def compute_nll(y, pred):
    """Return the mean Gaussian negative log-likelihood of observations *y*.

    ``pred`` holds the distributional parameters per sample: column 0 is the
    predictive mean, column 1 the predictive variance.
    """
    mu, var = pred[:, 0], pred[:, 1]
    # Per-sample NLL of N(mu, var): 0.5 * (log(2*pi*var) + (y - mu)^2 / var).
    per_sample = 0.5 * (np.log(2.0 * np.pi * var) + np.square(y - mu) / var)
    return per_sample.mean()
|
||
|
||
def timeit(method):
    """Decorator that logs start/finish and wall-clock duration of *method*.

    The wrapped function's return value is passed through unchanged.
    """
    from functools import wraps

    @wraps(method)  # preserve __name__/__doc__ of the wrapped function
    def timed(*args, **kw):
        started = datetime.now()
        logger.info(f"Started {method.__name__}")
        result = method(*args, **kw)
        finished = datetime.now()
        logger.info(f"Finished {method.__name__}")
        # BUG FIX: `(te - ts) / 60` yields a timedelta scaled by 60 (it logs
        # something like "0:00:01.5"), not a number of minutes. Convert to
        # seconds first, then divide.
        minutes = (finished - started).total_seconds() / 60
        logger.info(f"{method.__name__} took: {minutes:.2f} minutes")
        return result

    return timed