Skip to content

Commit

Permalink
Save data to Parquet. Reformatted some strings.
Browse files: browse the repository at this point in the history
  • Loading branch information
Gramet committed Apr 18, 2024
1 parent 4303e89 commit 5a5cc05
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 167 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ matplotlib = "*"
seaborn = "*"
scikit-learn = "*"
shap = "*"
pyarrow = "*"

[dev-packages]
black = "*"
Expand Down
324 changes: 183 additions & 141 deletions Pipfile.lock

Large diffs are not rendered by default.

32 changes: 16 additions & 16 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6,54 +6,54 @@ stages:
deps:
- path: src/pipeline/preprocess
hash: md5
md5: eb065a4013762fe2c8ff5b044eddaecb.dir
size: 1635
md5: 8260624b4b5a55764d29de5f1693b825.dir
size: 1572
nfiles: 2
outs:
- path: data/preprocess
hash: md5
md5: 0bef8a54092da8297dfeea73c938e3ac.dir
size: 105893
md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir
size: 99640
nfiles: 3
train:
cmd: python -m src.pipeline.train.training --preprocess-folder "data/preprocess/"
--output-folder "data/train/"
deps:
- path: data/preprocess
hash: md5
md5: 0bef8a54092da8297dfeea73c938e3ac.dir
size: 105893
md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir
size: 99640
nfiles: 3
- path: src/pipeline/train
hash: md5
md5: 4dde5789a75857373cc0acbd95a1a098.dir
size: 1250
md5: 77e525a708041d3b75310f17c6506b86.dir
size: 1469
nfiles: 2
params:
params.py:
SEED: 42
outs:
- path: data/train
hash: md5
md5: aba29454e9c2303472b70dd194f3ea64.dir
size: 3334230
md5: 93fc32f25b693493ec68313a5492d32b.dir
size: 3396162
nfiles: 1
evaluate:
cmd: python -m src.pipeline.evaluate.evaluation --preprocess-folder "data/preprocess"
--train-folder "data/train/"
deps:
- path: data/preprocess
hash: md5
md5: 0bef8a54092da8297dfeea73c938e3ac.dir
size: 105893
md5: 0d3aad6e7d0f13fa5ee381d7304a94ef.dir
size: 99640
nfiles: 3
- path: data/train
hash: md5
md5: aba29454e9c2303472b70dd194f3ea64.dir
size: 3334230
md5: 93fc32f25b693493ec68313a5492d32b.dir
size: 3396162
nfiles: 1
- path: src/pipeline/evaluate
hash: md5
md5: ceb3990296eeb56b57c1a306cd04ab10.dir
size: 1049
md5: ea95423ef009670bf0d89fd0a9a3ed50.dir
size: 1045
nfiles: 2
4 changes: 2 additions & 2 deletions src/pipeline/evaluate/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

def main(preprocess_folder: Path = typer.Option(...), train_folder: Path = typer.Option(...)) -> None:
"""Main function."""
df_test = pd.read_csv(preprocess_folder / "test_data.csv")
df_train = pd.read_csv(preprocess_folder / "train_data.csv")
df_test = pd.read_parquet(preprocess_folder / "test_data.parquet")
df_train = pd.read_parquet(preprocess_folder / "train_data.parquet")

X_test, y_test = split_feature_and_label(df_test)
_, y_train = split_feature_and_label(df_train)
Expand Down
10 changes: 4 additions & 6 deletions src/pipeline/preprocess/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ def main(input_folder: Path = typer.Option(...), output_folder: Path = typer.Opt

# Encode categorical
binary_cols = [col for col in df.columns if len(df[col].unique()) == 2]
print(binary_cols)
n_unique_by_column = df.nunique()
binary_cols = n_unique_by_column[n_unique_by_column == 2].index
df = pd.get_dummies(df, columns=binary_cols, drop_first=True, dtype=float) # maybe drop_first False or other method
Expand All @@ -30,13 +29,12 @@ def main(input_folder: Path = typer.Option(...), output_folder: Path = typer.Opt
# Split data
df_train, df_test = train_test_split(df, test_size=0.2)
df_train, df_val = train_test_split(df_train, test_size=0.2)
print(df_train.shape, df_val.shape, df_test.shape)
print(f"{df_train.shape=}, {df_val.shape=}, {df_test.shape=}")

# Save data
# TODO: Save data to parquet
df_train.to_csv(output_folder / "train_data.csv", index=False)
df_val.to_csv(output_folder / "val_data.csv", index=False)
df_test.to_csv(output_folder / "test_data.csv", index=False)
df_train.to_parquet(output_folder / "train_data.parquet", index=False)
df_val.to_parquet(output_folder / "val_data.parquet", index=False)
df_test.to_parquet(output_folder / "test_data.parquet", index=False)


if __name__ == "__main__":
Expand Down
4 changes: 2 additions & 2 deletions src/pipeline/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ def split_feature_and_label(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
def main(preprocess_folder: Path = typer.Option(...), output_folder: Path = typer.Option(...)) -> None:
"""Main function."""
# Load Data
df_train = pd.read_csv(preprocess_folder / "train_data.csv")
df_val = pd.read_csv(preprocess_folder / "val_data.csv")
df_train = pd.read_parquet(preprocess_folder / "train_data.parquet")
df_val = pd.read_parquet(preprocess_folder / "val_data.parquet")

X_train, y_train = split_feature_and_label(df_train)
X_val, y_val = split_feature_and_label(df_val)
Expand Down

0 comments on commit 5a5cc05

Please sign in to comment.