Skip to content

Commit

Permalink
Results of the live coding (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
Gramet authored Apr 24, 2024
1 parent 0bc99c9 commit 8e9d5f2
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 16 deletions.
3 changes: 3 additions & 0 deletions .dvc/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/config.local
/tmp
/cache
Empty file added .dvc/config
Empty file.
32 changes: 16 additions & 16 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6,54 +6,54 @@ stages:
deps:
- path: src/pipeline/preprocess
hash: md5
md5: 8260624b4b5a55764d29de5f1693b825.dir
size: 1572
md5: c0103315f729a77ca07420c77735a9de.dir
size: 1066
nfiles: 2
outs:
- path: data/preprocess
hash: md5
md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir
size: 99640
md5: e3e7f02c0d35afc5cccf9786ffdbc061.dir
size: 134027
nfiles: 3
train:
cmd: python -m src.pipeline.train.training --preprocess-folder "data/preprocess/"
--output-folder "data/train/"
deps:
- path: data/preprocess
hash: md5
md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir
size: 99640
md5: e3e7f02c0d35afc5cccf9786ffdbc061.dir
size: 134027
nfiles: 3
- path: src/pipeline/train
hash: md5
md5: 77e525a708041d3b75310f17c6506b86.dir
size: 1469
md5: 97cad4658c6c59e05ab81b676d04abca.dir
size: 1053
nfiles: 2
params:
params.py:
SEED: 42
outs:
- path: data/train
hash: md5
md5: 93fc32f25b693493ec68313a5492d32b.dir
size: 3396162
md5: 5f94e76055cef2085b7f2f8e2d1f8678.dir
size: 3396047
nfiles: 1
evaluate:
cmd: python -m src.pipeline.evaluate.evaluation --preprocess-folder "data/preprocess"
--train-folder "data/train/"
deps:
- path: data/preprocess
hash: md5
md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir
size: 99640
md5: e3e7f02c0d35afc5cccf9786ffdbc061.dir
size: 134027
nfiles: 3
- path: data/train
hash: md5
md5: 93fc32f25b693493ec68313a5492d32b.dir
size: 3396162
md5: 5f94e76055cef2085b7f2f8e2d1f8678.dir
size: 3396047
nfiles: 1
- path: src/pipeline/evaluate
hash: md5
md5: ea95423ef009670bf0d89fd0a9a3ed50.dir
size: 1045
md5: 3a95133c38b6b72f12c0451eed8baa27.dir
size: 1082
nfiles: 2
24 changes: 24 additions & 0 deletions src/pipeline/evaluate/evaluation.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,36 @@
"""Module for model evaluation."""

import pickle
from pathlib import Path
from statistics import mean

import pandas as pd
import typer
from sklearn.metrics import mean_absolute_error


def main(preprocess_folder: Path = typer.Option(...), train_folder: Path = typer.Option(...)) -> None:
    """Evaluate the trained model on the test split and print its MAE.

    The model stored in ``train_folder / "model.pkl"`` is scored as-is on
    ``df_test.parquet`` and compared with a constant baseline that always
    predicts the mean target of ``df_train.parquet``.

    Args:
        preprocess_folder: Folder containing ``df_train.parquet`` and
            ``df_test.parquet``.
        train_folder: Folder containing the pickled model ``model.pkl``.
    """
    df_train = pd.read_parquet(preprocess_folder / "df_train.parquet")
    df_test = pd.read_parquet(preprocess_folder / "df_test.parquet")

    # NOTE(review): pickle.load can execute arbitrary code; presumably the file
    # always comes from this pipeline's own train stage - confirm.
    with open(train_folder / "model.pkl", "rb") as f:
        model = pickle.load(f)

    target_column = "average_grade"
    # The training targets are only needed for the constant baseline. The
    # loaded model is deliberately NOT re-fitted here: evaluation must score
    # the artifact that was actually saved, not a freshly trained copy.
    y_train = df_train[target_column]

    X_test = df_test.drop(columns=[target_column])
    y_test = df_test[target_column]
    predictions = model.predict(X_test)

    baseline_pred = y_train.mean()

    mae = mean_absolute_error(y_test, predictions)
    mae_baseline = mean_absolute_error(y_test, [baseline_pred] * len(y_test))

    print(f"{mae=}, {mae_baseline=}")


if __name__ == "__main__":
Expand Down
18 changes: 18 additions & 0 deletions src/pipeline/preprocess/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,31 @@

from pathlib import Path

import numpy as np
import pandas as pd
import typer
from sklearn.model_selection import train_test_split


def main(
    input_folder: Path = typer.Option(...),
    output_folder: Path = typer.Option(...),
    seed: int = typer.Option(42),
) -> None:
    """Turn the raw student CSV into train/validation/test parquet files.

    Non-numeric columns are one-hot encoded, the three grade columns
    ``G1``/``G2``/``G3`` are replaced by their mean (``average_grade``), and
    the rows are split 80/20 into train+validation and test, then the first
    part is split 80/20 again into train and validation.

    Args:
        input_folder: Folder containing ``student-por.csv``.
        output_folder: Destination folder (created if missing) for
            ``df_train.parquet``, ``df_val.parquet`` and ``df_test.parquet``.
        seed: Random state used for both splits so that re-running the stage
            yields the same files.
            NOTE(review): dvc.lock records ``SEED: 42`` from ``params.py``;
            presumably this default should follow that value - confirm.
    """
    output_folder.mkdir(exist_ok=True, parents=True)

    df = pd.read_csv(input_folder / "student-por.csv")

    # Select columns by their dtype. Iterating ``df.items()`` yields
    # (name, Series) pairs, not dtypes, so the dtype check must not be fed
    # from it.
    categorical_variables = df.select_dtypes(exclude=np.number).columns.tolist()
    df = pd.get_dummies(df, columns=categorical_variables, drop_first=False, dtype=float)

    df["average_grade"] = (df["G1"] + df["G2"] + df["G3"]) / 3.0
    df = df.drop(["G1", "G2", "G3"], axis="columns")

    df_train, df_test = train_test_split(df, test_size=0.2, random_state=seed)
    df_train, df_val = train_test_split(df_train, test_size=0.2, random_state=seed)

    df_train.to_parquet(output_folder / "df_train.parquet")
    df_val.to_parquet(output_folder / "df_val.parquet")
    df_test.to_parquet(output_folder / "df_test.parquet")


# Script entry point: Typer builds the command-line interface from main's
# signature and invokes it.
if __name__ == "__main__":
    typer.run(main)
23 changes: 23 additions & 0 deletions src/pipeline/train/training.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,37 @@
"""Module for training the model."""

import pickle
from pathlib import Path

import pandas as pd
import typer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


def main(
    preprocess_folder: Path = typer.Option(...),
    output_folder: Path = typer.Option(...),
    seed: int = typer.Option(42),
) -> None:
    """Fit a random forest on the train split and save it as a pickle.

    The model is trained on ``df_train.parquet``, its mean absolute error on
    ``df_val.parquet`` is printed, and the fitted estimator is written to
    ``output_folder / "model.pkl"``.

    Args:
        preprocess_folder: Folder containing ``df_train.parquet`` and
            ``df_val.parquet``.
        output_folder: Destination folder (created if missing) for
            ``model.pkl``.
        seed: Random state of the forest, so that re-running the stage on the
            same data yields the same model.
            NOTE(review): dvc.lock tracks ``SEED: 42`` from ``params.py`` for
            this stage; presumably this default should follow that value -
            confirm.
    """
    output_folder.mkdir(exist_ok=True, parents=True)

    df_train = pd.read_parquet(preprocess_folder / "df_train.parquet")
    df_val = pd.read_parquet(preprocess_folder / "df_val.parquet")

    # Seeded so the saved artifact does not change between identical runs.
    model = RandomForestRegressor(random_state=seed)

    target_column = "average_grade"
    X_train = df_train.drop(columns=[target_column])
    y_train = df_train[target_column]
    model.fit(X_train, y_train)

    X_val = df_val.drop(columns=[target_column])
    y_val = df_val[target_column]
    predictions = model.predict(X_val)

    mae = mean_absolute_error(y_val, predictions)
    print(mae)
    with open(output_folder / "model.pkl", "wb") as f:
        pickle.dump(model, f)


# Script entry point: Typer builds the command-line interface from main's
# signature and invokes it.
if __name__ == "__main__":
    typer.run(main)

0 comments on commit 8e9d5f2

Please sign in to comment.