Commit d533dce

fix: MLFlow

da-the-dev committed Jul 16, 2024
1 parent 32ef028 commit d533dce
Showing 8 changed files with 64 additions and 67 deletions.
2 changes: 1 addition & 1 deletion configs/experiment.yaml
@@ -3,7 +3,7 @@ run_name: "runner"

 test_size: 0.2
 random_state: 88
-cv_n_jobs: 5
+cv_n_jobs: -1

 train_data_version: "v1.0"
 test_data_version: "v2.0"
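Note: cv_n_jobs: -1 tells scikit-learn to use every available CPU core instead of a fixed pool of five workers. A minimal sketch of how this setting is consumed, assuming it is forwarded to GridSearchCV's n_jobs as src/model.py does below:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=1_000, random_state=88)

    # n_jobs=-1 parallelizes the cross-validation fits across all cores;
    # the old value of 5 capped the worker pool at five processes.
    gs = GridSearchCV(
        estimator=LogisticRegression(solver="saga", max_iter=1000),
        param_grid={"C": [0.1, 0.5, 0.8]},
        cv=5,
        n_jobs=-1,
    )
    gs.fit(X, y)
    print(gs.best_params_)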
5 changes: 1 addition & 4 deletions configs/main.yaml
@@ -7,9 +7,6 @@ defaults:

 hydra:
   mode: MULTIRUN
-  # launcher:
-  #   n_jobs: -1
-
   sweeper:
     params:
-      +model: "lr"
+      +model: "lr, rf"
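Note: with hydra mode: MULTIRUN, the sweeper now fans the +model override out over both configs, so a single launch trains lr and rf back to back. A hedged sketch of the entrypoint this assumes (the real src/main.py wiring may differ slightly):

    import hydra
    from omegaconf import DictConfig

    @hydra.main(config_path="../configs", config_name="main", version_base=None)
    def run(cfg: DictConfig) -> None:
        # Under MULTIRUN, Hydra invokes this once per sweep value, with
        # cfg.model composed from configs/model/lr.yaml or rf.yaml.
        print(cfg.model.class_name)

    if __name__ == "__main__":
        run()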
11 changes: 8 additions & 3 deletions configs/model/lr.yaml
@@ -7,6 +7,11 @@ tag_value: "basic LR"
 module_name: "sklearn.linear_model"
 class_name: "LogisticRegression"

-params:
-  solver: ["saga"]
-  C: [0.1]
+hydra:
+  sweeper:
+    params:
+      penalty: ['l1', 'l2']
+      solver: ['saga']
+      max_iter: [100, 200, 1000]
+      random_state: [88]
+      C: [0.1, 0.5, 0.8]
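Note: the grid moved from a flat params block to hydra: sweeper: params inside the model config; train() below now reads it via cfg.model.hydra.sweeper.params. A quick way to sanity-check the node when the file is loaded standalone, outside Hydra composition (an illustration, not part of this commit):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("configs/model/lr.yaml")

    # Each key maps to the list of candidate values for the grid search.
    for name, values in cfg.hydra.sweeper.params.items():
        print(name, list(values))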
7 changes: 4 additions & 3 deletions configs/model/model.yaml
@@ -2,9 +2,6 @@ defaults:
   - _self_

 folds: 5
-best_model_name: ""
-best_model_alias: ""
-labels: ""

 evaluation_metric: "f1"
 cv_evaluation_metric: "mean_test_f1"
@@ -14,3 +11,7 @@ pyfunc_predict_fn: "predict_proba"
 metrics:
   accuracy: "accuracy"
   f1: "f1"
+
+best_model_name: ""
+best_model_alias: ""
+labels: ""
10 changes: 6 additions & 4 deletions configs/model/rf.yaml
@@ -7,7 +7,9 @@ tag_value: "basic RF"
 module_name: "sklearn.ensemble"
 class_name: "RandomForestClassifier"

-params:
-  n_estimators: [100, 200, 500]
-  criterion: ['gini', 'entropy', 'log_loss']
-  random_state: [88] #, 100, 44]
+hydra:
+  sweeper:
+    params:
+      n_estimators: [100, 200, 500]
+      criterion: ['gini', 'entropy', 'log_loss']
+      random_state: [88]
4 changes: 3 additions & 1 deletion src/data.py
@@ -143,6 +143,8 @@ def preprocess_data(
     for tf in cfg["time_features"]:
         df = dtf.encode_cyclic_time_data(df, tf[0], tf[1])

+    df = df.sample(n=10_000)
+
     # 5. Split the dataset into X and y
     X = df.drop(["Cancelled"], axis=1)
     y = df[["Cancelled"]]
@@ -185,7 +187,7 @@ def load_features(X: pd.DataFrame, y: pd.DataFrame, version: str) -> None:

 def extract_data(version: str, cfg: DictConfig):
     with dvc.api.open(cfg.data.sample_path, rev=version) as fd:
-        return pd.read_csv(fd), version
+        return pd.read_csv(fd).sample(10_000), version


 if __name__ == "__main__":
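Note: both preprocess_data and extract_data now cut the frame down to 10,000 rows, which keeps local MLflow runs fast at the cost of training on a subsample. DataFrame.sample draws without replacement and is nondeterministic unless seeded; a sketch of the reproducibility concern (random_state here is an assumption, the commit passes none):

    import pandas as pd

    df = pd.read_csv("data/samples/sample.csv")

    # Without random_state, every run trains on a different 10k subsample;
    # seeding pins the draw so runs stay comparable.
    subsample = df.sample(n=10_000, random_state=88)
    print(len(subsample))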
9 changes: 8 additions & 1 deletion src/main.py
@@ -7,7 +7,7 @@ def run(cfg: DictConfig):
     train_data_version = cfg.train_data_version

     print("Fetching features...")
-    X_train, X_test, y_train, y_test = fetch_features(
+    X_train, y_train = fetch_features(
         name="features_target",
         version=train_data_version,
         cfg=cfg,
@@ -20,6 +20,13 @@
         cfg,
     )

+    test_data_version = cfg.test_data_version
+    X_test, y_test = fetch_features(
+        name="features_target",
+        version=test_data_version,
+        cfg=cfg,
+    )
+
     print("Logging metadata...")
     log_metadata(
         cfg,
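Note: train and test features are now fetched as separate versioned artifacts (train_data_version "v1.0", test_data_version "v2.0" per configs/experiment.yaml) instead of coming from one in-process split. Underneath, each fetch pins a DVC revision; a minimal sketch of that call, reusing the sample path from src/model.py:

    import dvc.api
    import pandas as pd

    # rev selects the committed data version, so train (v1.0) and test
    # (v2.0) read different snapshots of the same artifact path.
    with dvc.api.open("data/samples/sample.csv", rev="v2.0") as fd:
        test_df = pd.read_csv(fd)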
83 changes: 33 additions & 50 deletions src/model.py
@@ -1,16 +1,11 @@
 import warnings

 warnings.filterwarnings("ignore")
-from omegaconf import DictConfig
-from sklearn.model_selection import GridSearchCV, train_test_split  # noqa: E402
 from zenml.client import Client  # noqa: E402
-import pandas as pd
-from pandas import DataFrame  # noqa: E402
 import mlflow  # noqa: E402
 import mlflow.sklearn  # noqa: E402
 import importlib  # noqa: E402
-import dvc.api
-from src.data import preprocess_data
+from omegaconf import DictConfig
+from sklearn.model_selection import GridSearchCV  # noqa: E402
+from pandas import DataFrame  # noqa: E402
+from src.data import preprocess_data, extract_data  # noqa: E402


 mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
@@ -22,17 +17,9 @@ def fetch_features(name: str, version: str, cfg: DictConfig):
     # lst = client.list_artifact_versions(name=name, tag=version, sort_by="version").items
     # lst.reverse()

-    with dvc.api.open("data/samples/sample.csv", rev="v1.0") as fd:
-        X, y = preprocess_data(cfg, pd.read_csv(fd))
-    X_train, X_test, y_train, y_test = train_test_split(
-        X,
-        y,
-        test_size=cfg.test_size,
-        random_state=cfg.random_state,
-        stratify=y,
-        shuffle=True,
-    )
-    return X_train, X_test, y_train, y_test
+    df, _ = extract_data(version, cfg)
+    X, y = preprocess_data(cfg, df)
+    return X, y


 def train(
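Note: fetch_features no longer performs a stratified train_test_split; it returns the full (X, y) for the requested data version, and the train/test distinction now comes from fetching two versions in src/main.py. A caller that still wanted an in-version holdout would have to split explicitly; a sketch of the removed behavior for comparison (values taken from configs/experiment.yaml):

    from sklearn.model_selection import train_test_split

    X, y = fetch_features("features_target", version="v1.0", cfg=cfg)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=88, stratify=y, shuffle=True
    )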
@@ -41,7 +28,7 @@ def train(
     cfg: DictConfig,
 ):
     # Define the model hyperparameters
-    params = cfg.model.params
+    params = cfg.model.hydra.sweeper.params

     # Train the model
     module_name = cfg.model.module_name
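Note: train() now pulls its grid from the relocated cfg.model.hydra.sweeper.params node. The values arrive as OmegaConf ListConfig objects; GridSearchCV accepts any sequence, but converting to plain Python containers is a safer habit when the grid later gets logged or pickled. A sketch of that defensive conversion (an addition on top of this commit, which passes the node as-is):

    from omegaconf import OmegaConf

    # to_container yields plain dicts/lists in place of DictConfig/ListConfig.
    param_grid = OmegaConf.to_container(
        cfg.model.hydra.sweeper.params, resolve=True
    )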
@@ -76,6 +63,7 @@ def train(
     # Define evaluation metric
     evaluation_metric = cfg.model.evaluation_metric

+    # Instantiate GridSearch
     gs = GridSearchCV(
         estimator=estimator,
         param_grid=param_grid,
@@ -87,7 +75,8 @@ def train(
         return_train_score=True,
     )

-    gs.fit(X_train, y_train)
+    # Fit GridSearch
+    gs.fit(X_train, y_train.values.ravel())

     return gs

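Note: y_train is a single-column DataFrame (src/data.py builds it as df[["Cancelled"]]), and passing a column vector to fit() triggers scikit-learn's DataConversionWarning. .values.ravel() flattens it to the 1-D label array estimators expect:

    import pandas as pd

    y_train = pd.DataFrame({"Cancelled": [0, 1, 0, 1]})

    # (n, 1) column vector -> (n,) 1-D label array.
    print(y_train.values.ravel().shape)  # (4,)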
@@ -171,6 +160,7 @@ def log_metadata(
     # Log the performance metrics
     mlflow.log_metrics(best_metrics_dict)
+    print("DEBUG:", best_metrics_dict)

     # Set a tag that we can use to remind ourselves what this run was for
     mlflow.set_tag(cfg.model.tag_key, cfg.model.tag_value)
@@ -200,7 +190,9 @@ def log_metadata(
     for index, result in cv_results.iterrows():
         child_run_name = "_".join(["child", run_name, str(index)])
         with mlflow.start_run(
-            run_name=child_run_name, experiment_id=experiment_id, nested=True
+            run_name=child_run_name,
+            experiment_id=experiment_id,
+            nested=True,
         ):
             ps = result.filter(regex="param_").to_dict()
             ms = result.filter(regex="mean_").to_dict()
@@ -225,15 +217,6 @@
             estimator = class_instance(**ps)
             estimator.fit(X_train, y_train)

-            # from sklearn.model_selection import cross_val_score
-            # scores = cross_val_score(estimator=estimator,
-            #                          X_train,
-            #                          y_train,
-            #                          cv = cfg.model.folds,
-            #                          n_jobs=cfg.cv_n_jobs,
-            #                          scoring=cfg.model.cv_evaluation_metric)
-            # cv_evaluation_metric = scores.mean()
-
             signature = mlflow.models.infer_signature(
                 X_train, estimator.predict(X_train)
             )
@@ -266,28 +249,28 @@ def log_metadata(

     print(f"metrics:\n{results.metrics}")

-    # mlflow.end_run()
+    mlflow.end_run()

-    # mlflow.end_run()
+    mlflow.end_run()


-# def retrieve_model_with_alias(
-#     model_name, model_alias="champion"
-# ) -> mlflow.pyfunc.PyFuncModel:
-#     best_model: mlflow.pyfunc.PyFuncModel = mlflow.pyfunc.load_model(
-#         model_uri=f"models:/{model_name}@{model_alias}"
-#     )
+def retrieve_model_with_alias(
+    model_name, model_alias="champion"
+) -> mlflow.pyfunc.PyFuncModel:
+    best_model: mlflow.pyfunc.PyFuncModel = mlflow.pyfunc.load_model(
+        model_uri=f"models:/{model_name}@{model_alias}"
+    )

-#     # best_model
-#     return best_model
+    # best_model
+    return best_model


-# def retrieve_model_with_version(
-#     model_name, model_version="v1"
-# ) -> mlflow.pyfunc.PyFuncModel:
-#     best_model: mlflow.pyfunc.PyFuncModel = mlflow.pyfunc.load_model(
-#         model_uri=f"models:/{model_name}/{model_version}"
-#     )
+def retrieve_model_with_version(
+    model_name, model_version="v1"
+) -> mlflow.pyfunc.PyFuncModel:
+    best_model: mlflow.pyfunc.PyFuncModel = mlflow.pyfunc.load_model(
+        model_uri=f"models:/{model_name}/{model_version}"
+    )

-#     # best_model
-#     return best_model
+    # best_model
+    return best_model
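Note: re-enabling these helpers makes registry lookups part of the module's public surface. Both resolve models through the models:/ URI scheme; alias resolution requires the alias to be assigned in the MLflow registry behind the tracking URI. A usage sketch (the model name and alias are illustrative, not from this commit):

    # Assumes a registered model "champion_model" carrying the "champion"
    # alias on the tracking server at http://127.0.0.1:5000.
    model = retrieve_model_with_alias("champion_model", model_alias="champion")
    preds = model.predict(X_test)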
