Commit d533dce

fix: MLFlow

da-the-dev committed Jul 16, 2024
1 parent 32ef028 commit d533dce
Showing 8 changed files with 64 additions and 67 deletions.
2 changes: 1 addition & 1 deletion configs/experiment.yaml
@@ -3,7 +3,7 @@ run_name: "runner"

 test_size: 0.2
 random_state: 88
-cv_n_jobs: 5
+cv_n_jobs: -1

 train_data_version: "v1.0"
 test_data_version: "v2.0"
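Note: cv_n_jobs: -1 tells scikit-learn to use every available CPU core instead of a fixed pool of five workers. A minimal sketch of how this setting is consumed, assuming it is forwarded to GridSearchCV's n_jobs as src/model.py does below:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    X, y = make_classification(n_samples=1_000, random_state=88)

    # n_jobs=-1 parallelizes the cross-validation fits across all cores;
    # the old value of 5 capped the worker pool at five processes.
    gs = GridSearchCV(
        estimator=LogisticRegression(solver="saga", max_iter=1000),
        param_grid={"C": [0.1, 0.5, 0.8]},
        cv=5,
        n_jobs=-1,
    )
    gs.fit(X, y)
    print(gs.best_params_)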
5 changes: 1 addition & 4 deletions configs/main.yaml
@@ -7,9 +7,6 @@ defaults:

 hydra:
   mode: MULTIRUN
-  # launcher:
-  #   n_jobs: -1
-
   sweeper:
     params:
-      +model: "lr"
+      +model: "lr, rf"
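Note: with hydra mode: MULTIRUN, the sweeper now fans the +model override out over both configs, so a single launch trains lr and rf back to back. A hedged sketch of the entrypoint this assumes (the real src/main.py wiring may differ slightly):

    import hydra
    from omegaconf import DictConfig

    @hydra.main(config_path="../configs", config_name="main", version_base=None)
    def run(cfg: DictConfig) -> None:
        # Under MULTIRUN, Hydra invokes this once per sweep value, with
        # cfg.model composed from configs/model/lr.yaml or rf.yaml.
        print(cfg.model.class_name)

    if __name__ == "__main__":
        run()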
11 changes: 8 additions & 3 deletions configs/model/lr.yaml
@@ -7,6 +7,11 @@ tag_value: "basic LR"
 module_name: "sklearn.linear_model"
 class_name: "LogisticRegression"

-params:
-  solver: ["saga"]
-  C: [0.1]
+hydra:
+  sweeper:
+    params:
+      penalty: ['l1', 'l2']
+      solver: ['saga']
+      max_iter: [100, 200, 1000]
+      random_state: [88]
+      C: [0.1, 0.5, 0.8]
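Note: the grid moved from a flat params block to hydra: sweeper: params inside the model config; train() below now reads it via cfg.model.hydra.sweeper.params. A quick way to sanity-check the node when the file is loaded standalone, outside Hydra composition (an illustration, not part of this commit):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("configs/model/lr.yaml")

    # Each key maps to the list of candidate values for the grid search.
    for name, values in cfg.hydra.sweeper.params.items():
        print(name, list(values))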
7 changes: 4 additions & 3 deletions configs/model/model.yaml
@@ -2,9 +2,6 @@ defaults:
   - _self_

 folds: 5
-best_model_name: ""
-best_model_alias: ""
-labels: ""

 evaluation_metric: "f1"
 cv_evaluation_metric: "mean_test_f1"
@@ -14,3 +11,7 @@ pyfunc_predict_fn: "predict_proba"
 metrics:
   accuracy: "accuracy"
   f1: "f1"
+
+best_model_name: ""
+best_model_alias: ""
+labels: ""
10 changes: 6 additions & 4 deletions configs/model/rf.yaml
@@ -7,7 +7,9 @@ tag_value: "basic RF"
 module_name: "sklearn.ensemble"
 class_name: "RandomForestClassifier"

-params:
-  n_estimators: [100, 200, 500]
-  criterion: ['gini', 'entropy', 'log_loss']
-  random_state: [88] #, 100, 44]
+hydra:
+  sweeper:
+    params:
+      n_estimators: [100, 200, 500]
+      criterion: ['gini', 'entropy', 'log_loss']
+      random_state: [88]
4 changes: 3 additions & 1 deletion src/data.py
@@ -143,6 +143,8 @@ def preprocess_data(
     for tf in cfg["time_features"]:
         df = dtf.encode_cyclic_time_data(df, tf[0], tf[1])

+    df = df.sample(n=10_000)
+
     # 5. Split the dataset into X and y
     X = df.drop(["Cancelled"], axis=1)
     y = df[["Cancelled"]]
@@ -185,7 +187,7 @@ def load_features(X: pd.DataFrame, y: pd.DataFrame, version: str) -> None:

 def extract_data(version: str, cfg: DictConfig):
     with dvc.api.open(cfg.data.sample_path, rev=version) as fd:
-        return pd.read_csv(fd), version
+        return pd.read_csv(fd).sample(10_000), version


 if __name__ == "__main__":
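Note: both preprocess_data and extract_data now cut the frame down to 10,000 rows, which keeps local MLflow runs fast at the cost of training on a subsample. DataFrame.sample draws without replacement and is nondeterministic unless seeded; a sketch of the reproducibility concern (random_state here is an assumption, the commit passes none):

    import pandas as pd

    df = pd.read_csv("data/samples/sample.csv")

    # Without random_state, every run trains on a different 10k subsample;
    # seeding pins the draw so runs stay comparable.
    subsample = df.sample(n=10_000, random_state=88)
    print(len(subsample))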
9 changes: 8 additions & 1 deletion src/main.py
@@ -7,7 +7,7 @@ def run(cfg: DictConfig):
     train_data_version = cfg.train_data_version

     print("Fetching features...")
-    X_train, X_test, y_train, y_test = fetch_features(
+    X_train, y_train = fetch_features(
         name="features_target",
         version=train_data_version,
         cfg=cfg,
@@ -20,6 +20,13 @@
         cfg,
     )

+    test_data_version = cfg.test_data_version
+    X_test, y_test = fetch_features(
+        name="features_target",
+        version=test_data_version,
+        cfg=cfg,
+    )
+
     print("Logging metadata...")
     log_metadata(
         cfg,
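Note: train and test features are now fetched as separate versioned artifacts (train_data_version "v1.0", test_data_version "v2.0" per configs/experiment.yaml) instead of coming from one in-process split. Underneath, each fetch pins a DVC revision; a minimal sketch of that call, reusing the sample path from src/model.py:

    import dvc.api
    import pandas as pd

    # rev selects the committed data version, so train (v1.0) and test
    # (v2.0) read different snapshots of the same artifact path.
    with dvc.api.open("data/samples/sample.csv", rev="v2.0") as fd:
        test_df = pd.read_csv(fd)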
83 changes: 33 additions & 50 deletions src/model.py
@@ -1,16 +1,11 @@
 import warnings

 warnings.filterwarnings("ignore")
-from omegaconf import DictConfig
-from sklearn.model_selection import GridSearchCV, train_test_split  # noqa: E402
 from zenml.client import Client  # noqa: E402
-import pandas as pd
-from pandas import DataFrame  # noqa: E402
 import mlflow  # noqa: E402
 import mlflow.sklearn  # noqa: E402
 import importlib  # noqa: E402
-import dvc.api
-from src.data import preprocess_data
+from omegaconf import DictConfig
+from sklearn.model_selection import GridSearchCV  # noqa: E402
+from pandas import DataFrame  # noqa: E402
+from src.data import preprocess_data, extract_data  # noqa: E402


 mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
@@ -22,17 +17,9 @@ def fetch_features(name: str, version: str, cfg: DictConfig):
     # lst = client.list_artifact_versions(name=name, tag=version, sort_by="version").items
     # lst.reverse()

-    with dvc.api.open("data/samples/sample.csv", rev="v1.0") as fd:
-        X, y = preprocess_data(cfg, pd.read_csv(fd))
-    X_train, X_test, y_train, y_test = train_test_split(
-        X,
-        y,
-        test_size=cfg.test_size,
-        random_state=cfg.random_state,
-        stratify=y,
-        shuffle=True,
-    )
-    return X_train, X_test, y_train, y_test
+    df, _ = extract_data(version, cfg)
+    X, y = preprocess_data(cfg, df)
+    return X, y


 def train(
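Note: fetch_features no longer performs a stratified train_test_split; it returns the full (X, y) for the requested data version, and the train/test distinction now comes from fetching two versions in src/main.py. A caller that still wanted an in-version holdout would have to split explicitly; a sketch of the removed behavior for comparison (values taken from configs/experiment.yaml):

    from sklearn.model_selection import train_test_split

    X, y = fetch_features("features_target", version="v1.0", cfg=cfg)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=88, stratify=y, shuffle=True
    )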
@@ -41,7 +28,7 @@ def train(
     cfg: DictConfig,
 ):
     # Define the model hyperparameters
-    params = cfg.model.params
+    params = cfg.model.hydra.sweeper.params

     # Train the model
     module_name = cfg.model.module_name
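Note: train() now pulls its grid from the relocated cfg.model.hydra.sweeper.params node. The values arrive as OmegaConf ListConfig objects; GridSearchCV accepts any sequence, but converting to plain Python containers is a safer habit when the grid later gets logged or pickled. A sketch of that defensive conversion (an addition on top of this commit, which passes the node as-is):

    from omegaconf import OmegaConf

    # to_container yields plain dicts/lists in place of DictConfig/ListConfig.
    param_grid = OmegaConf.to_container(
        cfg.model.hydra.sweeper.params, resolve=True
    )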
@@ -76,6 +63,7 @@ def train(
     # Define evaluation metric
     evaluation_metric = cfg.model.evaluation_metric

+    # Instantiate GridSearch
     gs = GridSearchCV(
         estimator=estimator,
         param_grid=param_grid,
@@ -87,7 +75,8 @@ def train(
         return_train_score=True,
     )

-    gs.fit(X_train, y_train)
+    # Fit GridSearch
+    gs.fit(X_train, y_train.values.ravel())

     return gs

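Note: y_train is a single-column DataFrame (src/data.py builds it as df[["Cancelled"]]), and passing a column vector to fit() triggers scikit-learn's DataConversionWarning. .values.ravel() flattens it to the 1-D label array estimators expect:

    import pandas as pd

    y_train = pd.DataFrame({"Cancelled": [0, 1, 0, 1]})

    # (n, 1) column vector -> (n,) 1-D label array.
    print(y_train.values.ravel().shape)  # (4,)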
@@ -171,6 +160,7 @@ def log_metadata(
     # Log the performance metrics
     mlflow.log_metrics(best_metrics_dict)
+    print("DEBUG:", best_metrics_dict)

     # Set a tag that we can use to remind ourselves what this run was for
     mlflow.set_tag(cfg.model.tag_key, cfg.model.tag_value)
@@ -200,7 +190,9 @@ def log_metadata(
     for index, result in cv_results.iterrows():
         child_run_name = "_".join(["child", run_name, str(index)])
         with mlflow.start_run(
-            run_name=child_run_name, experiment_id=experiment_id, nested=True
+            run_name=child_run_name,
+            experiment_id=experiment_id,
+            nested=True,
         ):
             ps = result.filter(regex="param_").to_dict()
             ms = result.filter(regex="mean_").to_dict()
@@ -225,15 +217,6 @@
             estimator = class_instance(**ps)
             estimator.fit(X_train, y_train)

-            # from sklearn.model_selection import cross_val_score
-            # scores = cross_val_score(estimator=estimator,
-            #                          X_train,
-            #                          y_train,
-            #                          cv = cfg.model.folds,
-            #                          n_jobs=cfg.cv_n_jobs,
-            #                          scoring=cfg.model.cv_evaluation_metric)
-            # cv_evaluation_metric = scores.mean()
-
             signature = mlflow.models.infer_signature(
                 X_train, estimator.predict(X_train)
             )
@@ -266,28 +249,28 @@ def log_metadata(

     print(f"metrics:\n{results.metrics}")

-    # mlflow.end_run()
+    mlflow.end_run()

-    # mlflow.end_run()
+    mlflow.end_run()


-# def retrieve_model_with_alias(
-#     model_name, model_alias="champion"
-# ) -> mlflow.pyfunc.PyFuncModel:
-#     best_model: mlflow.pyfunc.PyFuncModel = mlflow.pyfunc.load_model(
-#         model_uri=f"models:/{model_name}@{model_alias}"
-#     )
+def retrieve_model_with_alias(
+    model_name, model_alias="champion"
+) -> mlflow.pyfunc.PyFuncModel:
+    best_model: mlflow.pyfunc.PyFuncModel = mlflow.pyfunc.load_model(
+        model_uri=f"models:/{model_name}@{model_alias}"
+    )

-#     # best_model
-#     return best_model
+    # best_model
+    return best_model


-# def retrieve_model_with_version(
-#     model_name, model_version="v1"
-# ) -> mlflow.pyfunc.PyFuncModel:
-#     best_model: mlflow.pyfunc.PyFuncModel = mlflow.pyfunc.load_model(
-#         model_uri=f"models:/{model_name}/{model_version}"
-#     )
+def retrieve_model_with_version(
+    model_name, model_version="v1"
+) -> mlflow.pyfunc.PyFuncModel:
+    best_model: mlflow.pyfunc.PyFuncModel = mlflow.pyfunc.load_model(
+        model_uri=f"models:/{model_name}/{model_version}"
+    )

-#     # best_model
-#     return best_model
+    # best_model
+    return best_model
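Note: re-enabling these helpers makes registry lookups part of the module's public surface. Both resolve models through the models:/ URI scheme; alias resolution requires the alias to be assigned in the MLflow registry behind the tracking URI. A usage sketch (the model name and alias are illustrative, not from this commit):

    # Assumes a registered model "champion_model" carrying the "champion"
    # alias on the tracking server at http://127.0.0.1:5000.
    model = retrieve_model_with_alias("champion_model", model_alias="champion")
    preds = model.predict(X_test)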
