
Commit

Formatted the code
prithagupta committed Aug 16, 2024
1 parent 50480f5 commit 34e39f7
Showing 46 changed files with 1,724 additions and 540 deletions.
1 change: 1 addition & 0 deletions autoqild/__init__.py
@@ -1,5 +1,6 @@
"""Documentation of the API for public objects and functions in `autoqild`
package."""

from .automl import *
from .bayes_search import *
from .classifiers import *
1 change: 1 addition & 0 deletions autoqild/automl/__init__.py
@@ -5,5 +5,6 @@
These tools simplify the process of training, validating, and deploying
machine learning models with minimal manual intervention.
"""

from .autogluon_classifier import AutoGluonClassifier
from .tabpfn_classifier import AutoTabPFNClassifier
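
For orientation, a minimal usage sketch of the autoqild.automl subpackage touched by this commit (illustrative only; the random data and the binary-classification setup are assumptions, not taken from the repository):

import numpy as np
from autoqild.automl import AutoGluonClassifier

# Toy binary-classification data (assumption, for illustration only).
X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

model = AutoGluonClassifier(n_features=5, n_classes=2, time_limit=1800)
model.fit(X, y)
probabilities = model.predict_proba(X)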
85 changes: 66 additions & 19 deletions autoqild/automl/autogluon_classifier.py
@@ -10,7 +10,7 @@

from autoqild.automl.automl_core import AutomlClassifier
from .model_configurations import hyperparameters, reduced_hyperparameters
from ..utilities._utils import log_exception_error
from ..utilities.utils import log_exception_error


class AutoGluonClassifier(AutomlClassifier):
@@ -95,9 +95,21 @@ class AutoGluonClassifier(AutomlClassifier):
Property to check if the model is already fitted.
"""

def __init__(self, n_features, n_classes, time_limit=1800, output_folder=None, eval_metric="accuracy",
use_hyperparameters=True, delete_tmp_folder_after_terminate=True, auto_stack=True,
remove_boosting_models=True, verbosity=6, random_state=None, **kwargs):
def __init__(
self,
n_features,
n_classes,
time_limit=1800,
output_folder=None,
eval_metric="accuracy",
use_hyperparameters=True,
delete_tmp_folder_after_terminate=True,
auto_stack=True,
remove_boosting_models=True,
verbosity=6,
random_state=None,
**kwargs,
):
self.logger = logging.getLogger(name=AutoGluonClassifier.__name__)
self.random_state = check_random_state(random_state)
self.output_folder = output_folder
@@ -114,7 +126,16 @@ def __init__(self, n_features, n_classes, time_limit=1800, output_folder=None, e
else:
self.hyperparameters = None
if remove_boosting_models:
self.exclude_model_types = ["GBM", "CAT", "XGB", "LGB", "KNN", "NN_TORCH", "AG_AUTOMM", "LR"]
self.exclude_model_types = [
"GBM",
"CAT",
"XGB",
"LGB",
"KNN",
"NN_TORCH",
"AG_AUTOMM",
"LR",
]
else:
self.exclude_model_types = ["AG_AUTOMM", "LR"]
self.auto_stack = auto_stack
@@ -124,7 +145,9 @@ def __init__(self, n_features, n_classes, time_limit=1800, output_folder=None, e
self.time_limit = time_limit
self.model = None
self.class_label = "class"
self.columns = [f"feature_{i}" for i in range(self.n_features)] + [self.class_label]
self.columns = [f"feature_{i}" for i in range(self.n_features)] + [
self.class_label
]
if self.n_classes > 2:
self.problem_type = "multiclass"
if self.n_classes == 2:
@@ -153,21 +176,29 @@ def _is_fitted_(self) -> bool:

if self.model is not None:
self.leaderboard = self.model.leaderboard(extra_info=True)
time_taken = self.leaderboard["fit_time"].sum() + self.leaderboard["pred_time_val"].sum() + 20
time_taken = (
self.leaderboard["fit_time"].sum()
+ self.leaderboard["pred_time_val"].sum()
+ 20
)
difference = self.time_limit - time_taken
if 200 <= self.time_limit < 300:
limit = 150
elif self.time_limit >= 3000:
limit = 2000
else:
limit = 200
self.logger.info(f"Fitting time of the model {time_taken} and remaining {difference}, limit {limit}")
self.logger.info(
f"Fitting time of the model {time_taken} and remaining {difference}, limit {limit}"
)
num_models = len(self.leaderboard["fit_time"])
self.logger.info(f"Number of models trained is {num_models} ")
if num_models < 1200:
if num_models <= 50:
self.model = None
self.logger.info(f"Retraining the model since they are less than 50")
self.logger.info(
"Retraining the model since fewer than 50 models were trained"
)
if difference >= limit:
self.model = None
else:
@@ -176,8 +207,10 @@ def _is_fitted_(self) -> bool:
if self.model is None:
try:
shutil.rmtree(self.output_folder)
self.logger.error(f"Since the model is not completely fitted, the folder '{basename}' "
f"and its contents are deleted successfully.")
self.logger.error(
f"Since the model is not completely fitted, the folder '{basename}' "
f"and its contents are deleted successfully."
)
except OSError as error:
log_exception_error(self.logger, error)
self.logger.error("Folder does not exist")
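
The retrain heuristic in _is_fitted_ above can be condensed as follows (a restatement for readability, not code from the repository; the 20-second overhead and the thresholds are taken from the diff):

def should_retrain(time_limit, fit_time, pred_time_val, num_models):
    # Total time spent so far, plus a fixed 20 s overhead, as in the diff.
    time_taken = fit_time + pred_time_val + 20
    difference = time_limit - time_taken
    # Remaining-time threshold depends on the overall budget.
    if 200 <= time_limit < 300:
        limit = 150
    elif time_limit >= 3000:
        limit = 2000
    else:
        limit = 200
    # Retrain when few models finished or when enough budget remains.
    return num_models < 1200 and (num_models <= 50 or difference >= limit)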
@@ -202,15 +235,27 @@ def fit(self, X, y, **kwd):
while not self._is_fitted_:
try:
self.logger.info("Fitting the model from scratch")
self.model = TabularPredictor(label=self.class_label, sample_weight=self.sample_weight,
problem_type=self.problem_type, eval_metric=self.eval_metric,
path=self.output_folder, verbosity=self.verbosity)
self.model.fit(train_data, time_limit=self.time_limit, hyperparameters=self.hyperparameters,
hyperparameter_tune_kwargs=self.hyperparameter_tune_kwargs, auto_stack=self.auto_stack,
excluded_model_types=self.exclude_model_types)
self.model = TabularPredictor(
label=self.class_label,
sample_weight=self.sample_weight,
problem_type=self.problem_type,
eval_metric=self.eval_metric,
path=self.output_folder,
verbosity=self.verbosity,
)
self.model.fit(
train_data,
time_limit=self.time_limit,
hyperparameters=self.hyperparameters,
hyperparameter_tune_kwargs=self.hyperparameter_tune_kwargs,
auto_stack=self.auto_stack,
excluded_model_types=self.exclude_model_types,
)
except Exception as error:
log_exception_error(self.logger, error)
self.logger.error("Fit function did not work, checking the saved models")
self.logger.error(
"Fit function did not work, checking the saved models"
)
self.leaderboard = self.model.leaderboard(extra_info=True)
if self.delete_tmp_folder_after_terminate:
self.model.delete_models(models_to_keep="best", dry_run=False)
@@ -334,7 +379,9 @@ def convert_to_dataframe(self, X, y=None):
data = np.concatenate((X, y[:, None]), axis=1)

if self.n_features != X.shape[-1]:
raise ValueError(f"Dataset passed does not contain {self.n_features} features")
raise ValueError(
f"Dataset passed does not contain {self.n_features} features"
)

df_data = pd.DataFrame(data=data, columns=self.columns)
return df_data
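
A standalone sketch of the DataFrame layout convert_to_dataframe produces (derived from the column construction in __init__ above; the toy data is an assumption):

import numpy as np
import pandas as pd

n_features = 3
X = np.random.rand(4, n_features)
y = np.array([0, 1, 0, 1])

# Columns follow the naming used in __init__: feature_0 ... feature_{n-1}, then "class".
columns = [f"feature_{i}" for i in range(n_features)] + ["class"]
data = np.concatenate((X, y[:, None]), axis=1)
df_data = pd.DataFrame(data=data, columns=columns)
print(df_data.head())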
23 changes: 17 additions & 6 deletions autoqild/automl/automl_core.py
@@ -1,4 +1,5 @@
"""Abstract base class for AutoML classifiers."""

from abc import abstractmethod

from sklearn.base import BaseEstimator, ClassifierMixin
@@ -60,7 +61,7 @@ class AutomlClassifier(BaseEstimator, ClassifierMixin):

@abstractmethod
def fit(self, X, y, **kwd):
"""" Fit the AutoML classifier on the provided dataset.
"""Fit the AutoML classifier on the provided dataset.
Parameters
----------
@@ -89,7 +90,9 @@ def fit(self, X, y, **kwd):
for training the classifier on the dataset provided in `X` and `y`.
"""

raise NotImplementedError("The 'fit' method must be implemented by the subclass.")
raise NotImplementedError(
"The 'fit' method must be implemented by the subclass."
)

@abstractmethod
def score(self, X, y, sample_weight=None, verbose=0):
@@ -120,7 +123,9 @@ def score(self, X, y, sample_weight=None, verbose=0):
NotImplementedError
If the method is not implemented by the subclass.
"""
raise NotImplementedError("The 'fit' method must be implemented by the subclass.")
raise NotImplementedError(
"The 'score' method must be implemented by the subclass."
)

@abstractmethod
def predict(self, X, verbose=0):
@@ -144,7 +149,9 @@ def predict(self, X, verbose=0):
NotImplementedError
If the method is not implemented by the subclass.
"""
raise NotImplementedError("The 'predict' method must be implemented by the subclass.")
raise NotImplementedError(
"The 'predict' method must be implemented by the subclass."
)

@abstractmethod
def predict_proba(self, X, verbose=0):
@@ -168,7 +175,9 @@ def predict_proba(self, X, verbose=0):
NotImplementedError
If the method is not implemented by the subclass.
"""
raise NotImplementedError("The 'predict_proba' method must be implemented by the subclass.")
raise NotImplementedError(
"The 'predict_proba' method must be implemented by the subclass."
)

@abstractmethod
def decision_function(self, X, verbose=0):
@@ -194,7 +203,9 @@ def decision_function(self, X, verbose=0):
NotImplementedError
If the method is not implemented by the subclass.
"""
raise NotImplementedError("The 'decision_function' method must be implemented by the subclass.")
raise NotImplementedError(
"The 'decision_function' method must be implemented by the subclass."
)

def get_params(self, deep=True):
"""Get parameters for this estimator.
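
Since every abstract method above raises NotImplementedError, a concrete subclass must override all of them. A toy binary-classification sketch (illustrative only, not from the repository):

import numpy as np
from autoqild.automl.automl_core import AutomlClassifier

class MajorityClassClassifier(AutomlClassifier):
    # Always predicts the most frequent class seen during fit (binary toy case).

    def fit(self, X, y, **kwd):
        self.majority_ = int(np.bincount(y).argmax())
        return self

    def predict(self, X, verbose=0):
        return np.full(len(X), self.majority_)

    def score(self, X, y, sample_weight=None, verbose=0):
        return float(np.mean(self.predict(X) == y))

    def predict_proba(self, X, verbose=0):
        proba = np.zeros((len(X), 2))
        proba[:, self.majority_] = 1.0
        return proba

    def decision_function(self, X, verbose=0):
        return self.predict_proba(X)[:, 1]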
19 changes: 13 additions & 6 deletions autoqild/automl/model_configurations.py
@@ -1,4 +1,5 @@
"""Configurations for search space for AutoGluon tools."""

from autogluon.common.space import Real, Int, Categorical

hyperparameters = {
@@ -36,7 +37,10 @@
"learning_rate": Real(1e-5, 1e-1, default=5e-4, log=True),
"wd": Real(1e-6, 1e-1, default=5e-4, log=True),
"emb_drop": Real(0.0, 0.5),
"ps": Real(0.0, 0.5, ),
"ps": Real(
0.0,
0.5,
),
"smoothing": Real(0.0, 0.5),
},
"RF": {
@@ -46,7 +50,7 @@
"max_features": Categorical("sqrt", "log2"),
"min_samples_leaf": Int(lower=2, upper=50, default=10),
"min_samples_split": Int(lower=2, upper=50, default=10),
"class_weight": Categorical("balanced", "balanced_subsample")
"class_weight": Categorical("balanced", "balanced_subsample"),
},
"XT": {
"n_estimators": Int(20, 300),
@@ -55,7 +59,7 @@
"max_features": Categorical("sqrt", "log2"),
"min_samples_leaf": Int(lower=2, upper=50, default=10),
"min_samples_split": Int(lower=2, upper=50, default=10),
"class_weight": Categorical("balanced", "balanced_subsample")
"class_weight": Categorical("balanced", "balanced_subsample"),
},
"KNN": {
"weights": Categorical("uniform", "distance"),
@@ -76,7 +80,10 @@
"learning_rate": Real(1e-5, 1e-1, default=5e-4, log=True),
"wd": Real(1e-6, 1e-1, default=5e-4, log=True),
"emb_drop": Real(0.0, 0.5),
"ps": Real(0.0, 0.5, ),
"ps": Real(
0.0,
0.5,
),
"smoothing": Real(0.0, 0.5),
},
"RF": {
@@ -86,7 +93,7 @@
"max_features": Categorical("sqrt", "log2"),
"min_samples_leaf": Int(lower=2, upper=50, default=10),
"min_samples_split": Int(lower=2, upper=50, default=10),
"class_weight": Categorical("balanced")
"class_weight": Categorical("balanced"),
},
"XT": {
"n_estimators": Int(20, 300),
@@ -95,7 +102,7 @@
"max_features": Categorical("sqrt", "log2"),
"min_samples_leaf": Int(lower=2, upper=50, default=10),
"min_samples_split": Int(lower=2, upper=50, default=10),
"class_weight": Categorical("balanced")
"class_weight": Categorical("balanced"),
},
}
"""This dictionary defines the hyperparameters for simpler models like
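
A sketch of how these search-space dictionaries are consumed (mirroring the TabularPredictor.fit call in autogluon_classifier.py above; the toy DataFrame and the "auto" tuning preset are assumptions, not from this commit):

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from autoqild.automl.model_configurations import hyperparameters

# Toy training frame shaped like convert_to_dataframe's output (assumption).
X = np.random.rand(200, 4)
y = np.random.randint(0, 2, size=200)
train_data = pd.DataFrame(
    np.concatenate((X, y[:, None]), axis=1),
    columns=[f"feature_{i}" for i in range(4)] + ["class"],
)

predictor = TabularPredictor(label="class", problem_type="binary")
predictor.fit(
    train_data,
    time_limit=600,
    hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs="auto",  # search spaces require a tuning strategy
)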

