
Commit

Removed useless functions from utils
prithagupta committed Aug 16, 2024
1 parent 47d3d3d commit 26e87c7
Showing 2 changed files with 11 additions and 279 deletions.
1 change: 1 addition & 0 deletions autoqild/utilities/metrics.py
@@ -399,6 +399,7 @@ def get_entropy_y(y_true):
entropy_y += -py * np.log2(py)
return entropy_y


def pc_softmax_estimation(y_true, p_pred):
"""Estimates the mutual information using predicted probabilities in the
softmax and PC-Softmax functions.
289 changes: 10 additions & 279 deletions docs/source/notebooks/utils.py
@@ -1,221 +1,19 @@
import inspect
import json
import logging
import multiprocessing
import os
import random
import re
import sys
from datetime import datetime, timedelta

import dill
import numpy as np
import openml
import psycopg2
import sklearn
import tensorflow as tf
import torch
from netcal.binning import IsotonicRegression, HistogramBinning
from netcal.scaling import LogisticCalibration, BetaCalibration, TemperatureScaling
from packaging import version
from sklearn.ensemble import (
RandomForestClassifier,
GradientBoostingClassifier,
AdaBoostClassifier,
ExtraTreesClassifier,
)
from sklearn.linear_model import RidgeClassifier, SGDClassifier
from sklearn.metrics import (
f1_score,
accuracy_score,
matthews_corrcoef,
mutual_info_score,
balanced_accuracy_score,
)
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.utils import check_random_state
from skopt.space import Real, Categorical, Integer

from autoqild import *

__all__ = [
"datasets",
"classifiers",
"calibrators",
"calibrator_params",
"mi_estimators",
"get_dataset_reader",
"learners",
"classification_metrics",
"mi_estimation_metrics",
"mi_metrics",
"lp_metric_dict",
"leakage_detectors",
"get_duration_seconds",
"duration_till_now",
"time_from_now",
"get_dataset_reader",
"seconds_to_time",
"time_from_now",
"create_search_space",
"get_dataset_reader",
"convert_learner_params",
"setup_logging",
"setup_random_seed",
"check_file_exists",
"get_automl_learned_estimator",
"get_time_taken",
"get_openml_datasets",
]


datasets = {
SYNTHETIC_DATASET: SyntheticDatasetGenerator,
SYNTHETIC_DISTANCE_DATASET: SyntheticDatasetGeneratorDistance,
SYNTHETIC_IMBALANCED_DATASET: SyntheticDatasetGenerator,
SYNTHETIC_DISTANCE_IMBALANCED_DATASET: SyntheticDatasetGeneratorDistance,
OPENML_DATASET: OpenMLTimingDatasetReader,
OPENML_PADDING_DATASET: OpenMLPaddingDatasetReader,
}
classifiers = {
MULTI_LAYER_PERCEPTRON: MultiLayerPerceptron,
SGD_CLASSIFIER: SGDClassifier,
RIDGE_CLASSIFIER: RidgeClassifier,
LINEAR_SVC: LinearSVC,
DECISION_TREE: DecisionTreeClassifier,
EXTRA_TREE: ExtraTreeClassifier,
RANDOM_FOREST: RandomForestClassifier,
EXTRA_TREES: ExtraTreesClassifier,
ADA_BOOST_CLASSIFIER: AdaBoostClassifier,
GRADIENT_BOOSTING_CLASSIFICATION: GradientBoostingClassifier,
BAYES_PREDICTOR: BayesPredictor,
MAJORITY_VOTING: MajorityVoting,
AUTO_GLUON: AutoGluonClassifier,
TABPFN: AutoTabPFNClassifier,
}

calibrators = {
ISOTONIC_REGRESSION: IsotonicRegression,
PLATT_SCALING: LogisticCalibration,
HISTOGRAM_BINNING: HistogramBinning,
BETA_CALIBRATION: BetaCalibration,
TEMPERATURE_SCALING: TemperatureScaling,
}
calibrator_params = {
ISOTONIC_REGRESSION: {"detection": False, "independent_probabilities": False},
PLATT_SCALING: {"temperature_only": False, "method": "mle"},
HISTOGRAM_BINNING: {"detection": False, "independent_probabilities": False},
BETA_CALIBRATION: {"detection": False, "independent_probabilities": False},
TEMPERATURE_SCALING: {"detection": False, "independent_probabilities": False},
}
mi_estimators = {
GMM_MI_ESTIMATOR: GMMMIEstimator,
"gmm_mi_estimator_more_instances": GMMMIEstimator,
"gmm_mi_estimator_true": GMMMIEstimator,
"gmm_mi_estimator_more_instances_true": GMMMIEstimator,
MINE_MI_ESTIMATOR: MineMIEstimator,
MINE_MI_ESTIMATOR_HPO: MineMIEstimatorMSE,
"softmax_mi_estimator": PCSoftmaxMIEstimator,
"pc_softmax_mi_estimator": PCSoftmaxMIEstimator,
}

leakage_detectors = {
AUTO_GLUON: AutoGluonLeakageDetector,
AUTO_GLUON_STACK: AutoGluonLeakageDetector,
TABPFN_VAR: TabPFNLeakageDetector,
TABPFN: TabPFNLeakageDetector,
RANDOM_FOREST: RandomForestLeakageDetector,
MULTI_LAYER_PERCEPTRON: MLPLeakageDetector,
MINE_MI_ESTIMATOR: MIEstimationLeakageDetector,
GMM_MI_ESTIMATOR: MIEstimationLeakageDetector,
}

learners = {**classifiers, **mi_estimators}

classification_metrics = {
ACCURACY: accuracy_score,
F_SCORE: f1_score,
# AUC_SCORE: auc_score,
MCC: matthews_corrcoef,
# INFORMEDNESS: balanced_accuracy_score,
ESTIMATED_MUTUAL_INFORMATION_SCORE: mutual_info_score,
SANTHIUB: santhi_vardi_upper_bound,
HELLMANUB: helmann_raviv_upper_bound,
FANOSLB: fanos_lower_bound,
FANOS_ADJUSTEDLB: fanos_adjusted_lower_bound,
}
mi_estimation_metrics = {
MCMC_MI_ESTIMATION: None,
MCMC_LOG_LOSS: None,
MCMC_PC_SOFTMAX: None,
MCMC_SOFTMAX: None,
MID_POINT_MI_ESTIMATION: mid_point_mi,
LOG_LOSS_MI_ESTIMATION: log_loss_estimation,
LOG_LOSS_MI_ESTIMATION_ISOTONIC_REGRESSION: log_loss_estimation,
LOG_LOSS_MI_ESTIMATION_PLATT_SCALING: log_loss_estimation,
LOG_LOSS_MI_ESTIMATION_BETA_CALIBRATION: log_loss_estimation,
LOG_LOSS_MI_ESTIMATION_TEMPERATURE_SCALING: log_loss_estimation,
LOG_LOSS_MI_ESTIMATION_HISTOGRAM_BINNING: log_loss_estimation,
PC_SOFTMAX_MI_ESTIMATION: pc_softmax_estimation,
PC_SOFTMAX_MI_ESTIMATION_ISOTONIC_REGRESSION: pc_softmax_estimation,
PC_SOFTMAX_MI_ESTIMATION_PLATT_SCALING: pc_softmax_estimation,
PC_SOFTMAX_MI_ESTIMATION_BETA_CALIBRATION: pc_softmax_estimation,
PC_SOFTMAX_MI_ESTIMATION_TEMPERATURE_SCALING: pc_softmax_estimation,
PC_SOFTMAX_MI_ESTIMATION_HISTOGRAM_BINNING: pc_softmax_estimation,
}
ild_metrics = {
ACCURACY: accuracy_score,
F_SCORE: f1_score,
# AUC_SCORE: auc_score,
MCC: matthews_corrcoef,
INFORMEDNESS: balanced_accuracy_score,
FPR: false_positive_rate,
FNR: false_negative_rate,
}
mi_metrics = {
ESTIMATED_MUTUAL_INFORMATION: None,
MCMC_MI_ESTIMATION: None,
MCMC_LOG_LOSS: None,
MCMC_PC_SOFTMAX: None,
MCMC_SOFTMAX: None,
}
lp_metric_dict = {
AUTO_ML: {**classification_metrics, **mi_estimation_metrics},
CLASSIFICATION: {**classification_metrics, **mi_estimation_metrics},
MUTUAL_INFORMATION: {**mi_metrics, **classification_metrics},
MUTUAL_INFORMATION_NEW: {**mi_metrics, **classification_metrics},
LEAKAGE_DETECTION: ild_metrics,
}


def get_duration_seconds(duration):
time = int(re.findall(r"\d+", duration)[0])
d = duration.split(str(time))[1].upper()
options = {"D": 24 * 60 * 60, "H": 60 * 60, "M": 60}
return options[d] * time


def duration_till_now(start):
return (datetime.now() - start).total_seconds()


def seconds_to_time(target_time_sec):
return str(timedelta(seconds=target_time_sec))


def time_from_now(target_time_sec):
base_datetime = datetime.now()
delta = timedelta(seconds=target_time_sec)
target_date = base_datetime + delta
return target_date.strftime("%Y-%m-%d %H:%M:%S")


def get_dataset_reader(dataset_name, dataset_params):
dataset_func = datasets[dataset_name]
dataset_func = dataset_func(**dataset_params)
return dataset_func


def create_search_space(hp_ranges, logger):
def isint(v):
@@ -283,8 +81,16 @@ def setup_logging(log_path=None, level=logging.INFO):
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Suppresses INFO, WARNING, and ERROR logs
# Additional TensorFlow setting to disable GPU usage explicitly
tf.config.set_visible_devices([], "GPU")
# logging.captureWarnings(True)
logging.getLogger("pyro").setLevel(logging.ERROR)
logging.captureWarnings(False)
import warnings

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger("matplotlib").setLevel(logging.ERROR)
logging.getLogger("tensorflow").setLevel(logging.ERROR)
logging.getLogger("pytorch").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)


def setup_random_seed(random_state=1234):
@@ -329,78 +135,3 @@ def setup_random_seed(random_state=1234):
tf.config.experimental.set_memory_growth(gpu, True)
torch_gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info("Torch GPU device {}".format(torch_gpu))


def check_file_exists(file_path):
file_path = os.path.normpath(file_path)
if not os.path.exists(file_path):
print("Error: provided file path '%s' does not exist!" % file_path)
sys.exit(-1)
return


def get_automl_learned_estimator(optimizers_file_path, logger):
try:
estimator = dill.load(open(optimizers_file_path, "rb"))
except Exception as error:
log_exception_error(logger, error)
logger.error(f"No such file or directory: {optimizers_file_path}")
estimator = None
return estimator


def get_openml_datasets():
YOUR_API_KEY = "2e5bf0586e06bc552a66c263cbdbd52f"
USER_ID = "2086"
openml.config.apikey = YOUR_API_KEY
datasets = openml.datasets.list_datasets()
openml_datasets = {}
for dataset_id, dataset in datasets.items():
# print(dataset)
if dataset["uploader"] == USER_ID:
# print(dataset['name'])
openml_datasets[dataset_id] = {
"name": dataset["name"],
"link": f"https://www.openml.org/d/{dataset_id}",
}
return openml_datasets


def get_openml_padding_datasets():
YOUR_API_KEY = "2e5bf0586e06bc552a66c263cbdbd52f"
USER_ID = "2086"
openml.config.apikey = YOUR_API_KEY
filtered_datasets = [
dataset_id
for dataset_id, dataset in openml.datasets.list_datasets().items()
if dataset["uploader"] == USER_ID
]
openml_datasets = {}
for dataset_id in filtered_datasets:
dataset = openml.datasets.get_dataset(dataset_id)
if "Padding" in dataset.description:
openml_datasets[dataset_id] = {
"name": dataset.name,
"link": f"https://www.openml.org/d/{dataset_id}",
}
return openml_datasets


def get_time_taken(log_path):
if os.path.exists(log_path):
with open(log_path, "r") as file:
log_content = file.read()

# Use regular expressions to find the time value in the log content
pattern = r"total-time\s+(\d+\.\d+)"
match = re.search(pattern, log_content)

if match:
time_value = float(match.group(1))
print("Time value:", time_value)
else:
print("Time value not found in the log file.")
time_value = 0
else:
time_value = 0
return time_value
