
Commit
initial commit to fix the file names bug
Signed-off-by: Ahmed Sheta <ahmed.sheta@fau.de>
ultiwinter7 committed Feb 3, 2024
1 parent 558296f commit 4d99756
Showing 4 changed files with 107 additions and 31 deletions.
4 changes: 3 additions & 1 deletion src/database/leads/local_repository.py
@@ -249,7 +249,9 @@ def save_classification_report(self, report, model_name: str):
         except Exception as e:
             log.error(f"Could not save report at {report_file_path}! Error: {str(e)}")
 
-    def load_preprocessed_data(self, file_name: str = "preprocessed_data.csv"):
+    def load_preprocessed_data(
+        self, file_name: str = "historical_preprocessed_data.csv"
+    ):
         try:
             return pd.read_csv(os.path.join(self.DF_PREPROCESSED_INPUT, file_name))
         except FileNotFoundError:
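With this change, callers of the local repository get the historical file by default and must name the lead file explicitly. A minimal usage sketch (the get_database() factory and the lead file name are taken from the other diffs in this commit; the import path is an assumption):

    from database import get_database  # import path assumed

    db = get_database()
    # the default now resolves to <DF_PREPROCESSED_INPUT>/historical_preprocessed_data.csv
    historical_df = db.load_preprocessed_data()
    # lead data has to be requested by name after this commit
    leads_df = db.load_preprocessed_data("leads_preprocessed_data.csv")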
4 changes: 3 additions & 1 deletion src/database/leads/s3_repository.py
@@ -374,7 +374,9 @@ def save_classification_report(self, report, model_name: str):
         except Exception as e:
             log.error(f"Could not save report for '{model_name}' to S3: {str(e)}")
 
-    def load_preprocessed_data(self, file_name: str = "preprocessed_data.csv"):
+    def load_preprocessed_data(
+        self, file_name: str = "historical_preprocessed_data.csv"
+    ):
         file_path = self.DF_PREPROCESSED_INPUT + file_name
         if not file_path.startswith("s3://"):
             log.error(
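The S3 variant builds the key by concatenating the repository prefix with the file name and guards on the URI scheme before reading. A standalone sketch of that pattern (the prefix value is illustrative, inferred from the output paths in demos.py; pandas can read s3:// URLs when s3fs is installed):

    import pandas as pd

    DF_PREPROCESSED_INPUT = "s3://amos--data--features/preprocessed_data_files/"  # assumed prefix

    def load_preprocessed_data(file_name: str = "historical_preprocessed_data.csv"):
        file_path = DF_PREPROCESSED_INPUT + file_name
        if not file_path.startswith("s3://"):
            # the repository logs an error here instead of raising
            raise ValueError(f"Not an S3 URI: {file_path}")
        return pd.read_csv(file_path)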
72 changes: 69 additions & 3 deletions src/demo/demos.py
@@ -10,6 +10,7 @@
 import re
 import subprocess
 
+import pandas as pd
 import xgboost as xgb
 from sklearn.metrics import classification_report
 
@@ -40,6 +41,7 @@
 INPUT_FILE_BDC = "../data/sumup_leads_email.csv"
 OUTPUT_FILE_BDC = "../data/collected_data.json"
 
+
 # evp demo
 def evp_demo():
     data = get_database().load_preprocessed_data()
@@ -212,19 +214,71 @@ def pipeline_demo():
 
 
 def preprocessing_demo():
-    if get_yes_no_input("Filter out the API-irrelevant data? (y/n)"):
+    if get_yes_no_input("Filter out the API-irrelevant data? (y/n)\n"):
         filter_bool = True
     else:
         filter_bool = False
     if get_yes_no_input(
-        "Run on historical data ? (y/n)\nNote: DATABASE_TYPE should be S3!"
+        "Run on historical data ? (y/n)\n'n' means it will run on lead data!\n"
     ):
         historical_bool = True
     else:
         historical_bool = False
+    if get_yes_no_input("Run on S3? (y/n)\n'n' means it will run locally!\n"):
+        S3_bool = True
+    else:
+        S3_bool = False
 
     preprocessor = Preprocessing(
         filter_null_data=filter_bool, historical_data=historical_bool
     )
+    if historical_bool and S3_bool:
+        preprocessor.data_path = (
+            "s3://amos--data--events/historical_data/100k_historic_enriched.csv"
+        )
+        preprocessor.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/historical_preprocessed_data.csv"
+    elif historical_bool and not S3_bool:
+        # input path
+        input_path_components = preprocessor.data_path.split(
+            "\\" if "\\" in preprocessor.data_path else "/"
+        )
+        input_path_components.pop()
+        input_path_components.append("100k_historic_enriched.csv")
+        input_path = "/".join(input_path_components)
+        preprocessor.data_path = input_path
+
+        # output path
+        path_components = preprocessor.data_path.split(
+            "\\" if "\\" in preprocessor.data_path else "/"
+        )
+        path_components.pop()
+        path_components.append(
+            "preprocessed_data_files/historical_preprocessed_data.csv"
+        )
+        preprocessor.prerocessed_data_output_path = "/".join(path_components)
+    elif not historical_bool and S3_bool:
+        preprocessor.data_path = "s3://amos--data--events/leads/enriched.csv"
+        preprocessor.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/leads_preprocessed_data.csv"
+    elif not historical_bool and not S3_bool:
+        # input path
+        input_path_components = preprocessor.data_path.split(
+            "\\" if "\\" in preprocessor.data_path else "/"
+        )
+        input_path_components.pop()
+        input_path_components.append("leads_enriched.csv")
+        input_path = "/".join(input_path_components)
+        preprocessor.data_path = input_path
+
+        # output path
+        path_components = preprocessor.data_path.split(
+            "\\" if "\\" in preprocessor.data_path else "/"
+        )
+        path_components.pop()
+        path_components.append("preprocessed_data_files/leads_preprocessed_data.csv")
+        preprocessor.prerocessed_data_output_path = "/".join(path_components)
+
+    preprocessor.preprocessed_df = pd.read_csv(preprocessor.data_path)
+
     df = preprocessor.implement_preprocessing_pipeline()
     preprocessor.save_preprocessed_data()
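Taken together, preprocessing_demo() now follows a three-step contract instead of relying on the Preprocessing constructor (which, per the preprocessing.py diff below, no longer reads any data itself): pick the input path, pick the output path, then load the frame. A hedged restatement of that contract (the "..." paths are placeholders, not values from the commit):

    preprocessor = Preprocessing(filter_null_data=True, historical_data=True)
    preprocessor.data_path = "..."  # enriched CSV to read (S3 URI or local path)
    preprocessor.prerocessed_data_output_path = "..."  # where save_preprocessed_data() writes
    preprocessor.preprocessed_df = pd.read_csv(preprocessor.data_path)
    df = preprocessor.implement_preprocessing_pipeline()
    preprocessor.save_preprocessed_data()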

@@ -250,7 +304,18 @@ def predict_MerchantSize_on_lead_data_demo():
     from preprocessing import Preprocessing
 
     preprocessor = Preprocessing(filter_null_data=False, historical_data=False)
-    leads_enriched_path = "s3://amos--data--events/leads/enriched.csv"
+
+    leads_enriched_path = "s3://amos--data--events/leads/enriched.csv"  # S3 path
+
+    # # input path
+    # input_path_components = preprocessor.data_path.split(
+    #     "\\" if "\\" in preprocessor.data_path else "/"
+    # )
+    # input_path_components.pop()
+    # input_path_components.append("leads_enriched.csv")
+    # input_path = "/".join(input_path_components)  # local path
+    # preprocessor.data_path = input_path
+
     if not leads_enriched_path:
         log.error(
             "No such file exists in the directory s3://amos--data--events/leads/enriched.csv"
@@ -259,6 +324,7 @@ def predict_MerchantSize_on_lead_data_demo():
     preprocessor.prerocessed_data_output_path = (
         "s3://amos--data--events/leads/preprocessed_leads_data.csv"
     )
+    preprocessor.preprocessed_df = pd.read_csv(leads_enriched_path)
     df = preprocessor.implement_preprocessing_pipeline()
     preprocessor.save_preprocessed_data()
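The branches in demos.py repeat one path-rewriting idiom: split on whichever separator the path actually contains, drop the last component, append the new file name, and rejoin with forward slashes. The same idiom could be factored into a helper; a sketch (not part of the commit, just a restatement of the technique):

    def replace_last_component(path: str, new_name: str) -> str:
        """Swap the final path component for new_name, normalizing to '/'."""
        sep = "\\" if "\\" in path else "/"
        components = path.split(sep)
        components.pop()  # discard the old file name
        components.append(new_name)
        return "/".join(components)

    # replace_last_component("data\\leads\\enriched.csv", "leads_enriched.csv")
    # -> "data/leads/leads_enriched.csv"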
58 changes: 32 additions & 26 deletions src/preprocessing/preprocessing.py
@@ -32,31 +32,34 @@ class Preprocessing:
     def __init__(self, filter_null_data=True, historical_data=False):
         data_repo = get_database()
         self.data_path = data_repo.get_output_path()
-        if historical_data:
-            input_path_components = self.data_path.split(
-                "\\" if "\\" in self.data_path else "/"
-            )
-            input_path_components.pop()
-            input_path_components.pop()
-            input_path_components.append("historical_data/100k_historic_enriched.csv")
-            input_path = "/".join(input_path_components)
-            data = pd.read_csv(input_path)
-            log.debug(f"Data path = {input_path}")
-        else:
-            log.debug(f"Data path = {self.data_path}")
-            data = pd.read_csv(self.data_path)
-        self.preprocessed_df = data.copy()
-
-        if historical_data:
-            self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv"
-        else:
-            # created the new output path based on which repo used
-            path_components = self.data_path.split(
-                "\\" if "\\" in self.data_path else "/"
-            )
-            path_components.pop()
-            path_components.append("preprocessed_data.csv")
-            self.prerocessed_data_output_path = "/".join(path_components)
+        self.preprocessed_df = None
+        self.prerocessed_data_output_path = None
+        # if historical_data:
+        #     input_path_components = self.data_path.split(
+        #         "\\" if "\\" in self.data_path else "/"
+        #     )
+        #     input_path_components.pop()
+        #     input_path_components.pop()
+        #     input_path_components.append("historical_data/100k_historic_enriched.csv")
+        #     input_path = "/".join(input_path_components)
+        #     data = pd.read_csv(input_path)
+        #     log.debug(f"Data path = {input_path}")
+        #     self.preprocessed_df = data.copy()
+        # else:
+        #     log.debug(f"Data path = {self.data_path}")
+        #     data = pd.read_csv(self.data_path)
+        #     self.preprocessed_df = data.copy()
+
+        # if historical_data:
+        #     self.prerocessed_data_output_path = "s3://amos--data--features/preprocessed_data_files/preprocessed_data.csv"
+        # else:
+        #     # created the new output path based on which repo used
+        #     path_components = self.data_path.split(
+        #         "\\" if "\\" in self.data_path else "/"
+        #     )
+        #     path_components.pop()
+        #     path_components.append("preprocessed_data_files/preprocessed_data.csv")
+        #     self.prerocessed_data_output_path = "/".join(path_components)
 
         self.filter_bool = filter_null_data
         # columns that would be added later after one-hot encoding each class
@@ -114,7 +117,10 @@ def filter_out_null_data(self):
         ]
 
     def fill_missing_values(self, column, strategy="constant"):
-        if column in self.preprocessed_df.columns:
+        if (
+            column in self.preprocessed_df.columns
+            and not self.preprocessed_df[column].empty
+        ):
             imputer = SimpleImputer(strategy=strategy)
             self.preprocessed_df[column] = imputer.fit_transform(
                 self.preprocessed_df[[column]]
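The widened guard in fill_missing_values() skips imputation when the column is absent or has no rows, which avoids SimpleImputer raising on an empty fit. A minimal standalone illustration of the call it protects (toy DataFrame and column name assumed):

    import pandas as pd
    from sklearn.impute import SimpleImputer

    df = pd.DataFrame({"merchant_size": [1.0, None, 3.0]})  # toy data
    column = "merchant_size"
    if column in df.columns and not df[column].empty:
        # strategy="constant" with no fill_value imputes 0 for numeric columns
        imputer = SimpleImputer(strategy="constant")
        df[column] = imputer.fit_transform(df[[column]])
    print(df)  # the NaN in row 1 becomes 0.0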
