-
Notifications
You must be signed in to change notification settings - Fork 4
/
script.py
99 lines (78 loc) · 3.13 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO
import argparse
import joblib
import os
import numpy as np
import pandas as pd
# Loading the model
def model_fn(model_dir):
    """Load and return the classifier stored as ``model.joblib`` in *model_dir*.

    Args:
        model_dir: Directory that contains the ``model.joblib`` artifact.

    Returns:
        The object deserialized by ``joblib.load``.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)
# Training entry point: the guarded section at the bottom runs top to bottom
# only when script.py is executed directly, not when it is imported.
def _parse_args(argv=None):
    """Parse the command-line arguments of the training run.

    Unrecognized arguments are ignored. The three directory options fall back
    to the ``SM_MODEL_DIR``, ``SM_CHANNEL_TRAIN`` and ``SM_CHANNEL_TEST``
    environment variables when they are not passed explicitly.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.

    Raises:
        SystemExit: raised through ``parser.error`` when a directory option
            is neither passed nor available from its environment variable.
    """
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client arrive as command-line arguments.
    # These two are specific to the Random Forest classifier.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args(argv)

    # Fail fast: a directory left as None would otherwise surface as an
    # opaque TypeError inside os.path.join -- for --model-dir only after the
    # whole model has already been trained.
    directory_options = (
        ("--model-dir", "SM_MODEL_DIR", args.model_dir),
        ("--train", "SM_CHANNEL_TRAIN", args.train),
        ("--test", "SM_CHANNEL_TEST", args.test),
    )
    missing = [
        "{} (or ${})".format(flag, env_var)
        for flag, env_var, value in directory_options
        if value is None
    ]
    if missing:
        parser.error("missing required directories: " + ", ".join(missing))

    return args


if __name__ == "__main__":
    print("[INFO] Extracting arguments")
    args = _parse_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    # The last column of the training CSV is the label; every column before
    # it is a feature.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()

    print("Label column is: ", label)
    print()

    # NOTE: the 85% / 15% figures below are fixed text; the split ratio is
    # not computed from the data in this script.
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()

    print("Training RandomForest Model.....")
    print()
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        random_state=args.random_state,
        verbose=3,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    print()

    # Persist under the same file name that model_fn loads.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    # Evaluate the fitted model on the test set.
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred_test)
    test_rep = classification_report(y_test, y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)