diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..e69de29 diff --git a/dvc.lock b/dvc.lock index 798d37f..1b143b6 100644 --- a/dvc.lock +++ b/dvc.lock @@ -6,14 +6,14 @@ stages: deps: - path: src/pipeline/preprocess hash: md5 - md5: 8260624b4b5a55764d29de5f1693b825.dir - size: 1572 + md5: c0103315f729a77ca07420c77735a9de.dir + size: 1066 nfiles: 2 outs: - path: data/preprocess hash: md5 - md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir - size: 99640 + md5: e3e7f02c0d35afc5cccf9786ffdbc061.dir + size: 134027 nfiles: 3 train: cmd: python -m src.pipeline.train.training --preprocess-folder "data/preprocess/" @@ -21,13 +21,13 @@ stages: deps: - path: data/preprocess hash: md5 - md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir - size: 99640 + md5: e3e7f02c0d35afc5cccf9786ffdbc061.dir + size: 134027 nfiles: 3 - path: src/pipeline/train hash: md5 - md5: 77e525a708041d3b75310f17c6506b86.dir - size: 1469 + md5: 97cad4658c6c59e05ab81b676d04abca.dir + size: 1053 nfiles: 2 params: params.py: @@ -35,8 +35,8 @@ stages: outs: - path: data/train hash: md5 - md5: 93fc32f25b693493ec68313a5492d32b.dir - size: 3396162 + md5: 5f94e76055cef2085b7f2f8e2d1f8678.dir + size: 3396047 nfiles: 1 evaluate: cmd: python -m src.pipeline.evaluate.evaluation --preprocess-folder "data/preprocess" @@ -44,16 +44,16 @@ stages: deps: - path: data/preprocess hash: md5 - md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir - size: 99640 + md5: e3e7f02c0d35afc5cccf9786ffdbc061.dir + size: 134027 nfiles: 3 - path: data/train hash: md5 - md5: 93fc32f25b693493ec68313a5492d32b.dir - size: 3396162 + md5: 5f94e76055cef2085b7f2f8e2d1f8678.dir + size: 3396047 nfiles: 1 - path: src/pipeline/evaluate hash: md5 - md5: ea95423ef009670bf0d89fd0a9a3ed50.dir - size: 1045 + md5: 3a95133c38b6b72f12c0451eed8baa27.dir + size: 1082 nfiles: 2 diff --git 
a/src/pipeline/evaluate/evaluation.py b/src/pipeline/evaluate/evaluation.py
index a754f1a..6c7eceb 100644
--- a/src/pipeline/evaluate/evaluation.py
+++ b/src/pipeline/evaluate/evaluation.py
@@ -1,12 +1,46 @@
 """Module for model evaluation."""
 
+import pickle
 from pathlib import Path
+from statistics import mean
 
+import pandas as pd
 import typer
+from sklearn.metrics import mean_absolute_error
 
 
 def main(preprocess_folder: Path = typer.Option(...), train_folder: Path = typer.Option(...)) -> None:
-    """Main function."""
+    """Print the test-set MAE of the trained model next to a mean baseline.
+
+    Args:
+        preprocess_folder: Folder containing ``df_train.parquet`` and
+            ``df_test.parquet``.
+        train_folder: Folder containing the pickled model, ``model.pkl``.
+    """
+    df_train = pd.read_parquet(preprocess_folder / "df_train.parquet")
+    df_test = pd.read_parquet(preprocess_folder / "df_test.parquet")
+
+    # pickle runs arbitrary code on load: only open artifacts written by the
+    # train stage of this pipeline.
+    with open(train_folder / "model.pkl", "rb") as f:
+        model = pickle.load(f)
+
+    target_column = "average_grade"
+    y_train = df_train[target_column]
+    X_test = df_test.drop(columns=[target_column])
+    y_test = df_test[target_column]
+
+    # Score the persisted model as loaded. Refitting it here would evaluate a
+    # different model from the one the train stage saved.
+    predictions = model.predict(X_test)
+
+    # Baseline: always predict the mean target of the training split.
+    baseline_pred = y_train.mean()
+
+    mae = mean_absolute_error(y_test, predictions)
+    mae_baseline = mean_absolute_error(y_test, [baseline_pred] * len(y_test))
+
+    print(f"{mae=}, {mae_baseline=}")
 
 
 if __name__ == "__main__":
diff --git a/src/pipeline/preprocess/preprocessing.py b/src/pipeline/preprocess/preprocessing.py
index abcd121..94dbba6 100644
--- a/src/pipeline/preprocess/preprocessing.py
+++ b/src/pipeline/preprocess/preprocessing.py
@@ -2,13 +2,47 @@
 
 from pathlib import Path
 
+import numpy as np
+import pandas as pd
 import typer
+from sklearn.model_selection import train_test_split
 
 
-def main(input_folder: Path = typer.Option(...), output_folder: Path = typer.Option(...)) -> None:
-    """Main function."""
+def main(
+    input_folder: Path = typer.Option(...),
+    output_folder: Path = typer.Option(...),
+    seed: int = typer.Option(42),
+) -> None:
+    """Build one-hot encoded train/validation/test splits of the student data.
+
+    Args:
+        input_folder: Folder containing ``student-por.csv``.
+        output_folder: Folder the three parquet splits are written to; created
+            if missing.
+        seed: Random state shared by both splits so reruns yield the same
+            partitions.
+    """
     output_folder.mkdir(exist_ok=True, parents=True)
 
+    df = pd.read_csv(input_folder / "student-por.csv")
+
+    # One-hot encode every non-numeric column.
+    categorical_variables = df.select_dtypes(exclude=np.number).columns.tolist()
+    df = pd.get_dummies(df, columns=categorical_variables, drop_first=False, dtype=float)
+
+    # The target is the mean of the three grade columns, which are then dropped
+    # from the features.
+    df["average_grade"] = (df["G1"] + df["G2"] + df["G3"]) / 3.0
+    df = df.drop(["G1", "G2", "G3"], axis="columns")
+
+    # Hold out 20% for test, then 20% of the remainder for validation.
+    df_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
+    df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=seed)
+
+    df_train.to_parquet(output_folder / "df_train.parquet")
+    df_val.to_parquet(output_folder / "df_val.parquet")
+    df_test.to_parquet(output_folder / "df_test.parquet")
+
 
 if __name__ == "__main__":
     typer.run(main)
diff --git a/src/pipeline/train/training.py b/src/pipeline/train/training.py
index 0c14b0b..02580d2 100644
--- a/src/pipeline/train/training.py
+++ b/src/pipeline/train/training.py
@@ -1,14 +1,51 @@
 """Module for training the model."""
 
+import pickle
 from pathlib import Path
 
+import pandas as pd
 import typer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error
 
 
-def main(preprocess_folder: Path = typer.Option(...), output_folder: Path = typer.Option(...)) -> None:
-    """Main function."""
+def main(
+    preprocess_folder: Path = typer.Option(...),
+    output_folder: Path = typer.Option(...),
+    seed: int = typer.Option(42),
+) -> None:
+    """Fit a random forest on the training split and pickle it.
+
+    The validation MAE of the fitted model is printed before it is saved.
+
+    Args:
+        preprocess_folder: Folder containing ``df_train.parquet`` and
+            ``df_val.parquet``.
+        output_folder: Folder ``model.pkl`` is written to; created if missing.
+        seed: Random state of the forest so retraining reproduces the model.
+    """
     output_folder.mkdir(exist_ok=True, parents=True)
 
+    df_train = pd.read_parquet(preprocess_folder / "df_train.parquet")
+    df_val = pd.read_parquet(preprocess_folder / "df_val.parquet")
+
+    model = RandomForestRegressor(random_state=seed)
+
+    target_column = "average_grade"
+    X_train = df_train.drop(columns=[target_column])
+    y_train = df_train[target_column]
+    model.fit(X_train, y_train)
+
+    X_val = df_val.drop(columns=[target_column])
+    y_val = df_val[target_column]
+    predictions = model.predict(X_val)
+
+    mae = mean_absolute_error(y_val, predictions)
+    print(mae)
+
+    with open(output_folder / "model.pkl", "wb") as f:
+        pickle.dump(model, f)
+
 
 if __name__ == "__main__":
     typer.run(main)