-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathModel-1.py
144 lines (106 loc) · 4.46 KB
/
Model-1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
"""Working_Model.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/15vMWS3F0cKUYSGMFpdKrZkNjuDhfApRm
"""
# Mount Google Drive so the train/test CSV files below are reachable.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import *
from sklearn.model_selection import *
from sklearn.metrics import *
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
# Load the training and hold-out test splits from Drive.
train = pd.read_csv("/content/drive/My Drive/train.csv", encoding="utf-8")
test = pd.read_csv("/content/drive/My Drive/test.csv", encoding="utf-8")
# Bare expression: only renders in a notebook; a no-op when run as a script.
train.dtypes
def data_info(data, data_types, dataframe_name):
    """Print a structural summary of *data*: its shape, ``DataFrame.info()``
    output, and a per-dtype feature breakdown for each dtype named in
    *data_types*."""
    rows, cols = data.shape
    print(" Information of ", dataframe_name, ": Rows = ", rows, "| Columns = ", cols, "\n")
    data.info()
    print("\n")
    for kind in data_types:
        matching = data.select_dtypes(include=[kind]).dtypes
        # Only report dtypes that actually occur in the frame.
        if not matching.empty:
            print(str(len(matching)) + " " + kind + " Features\n" + str(matching) + "\n")
# Dtypes to summarize, then report the training frame's structure.
data_types = ["float32","float64","int32","int64","object","category","datetime64[ns]"]
data_info(train, data_types, "train")
def display_head_tail(data, head_size=5, tail_size=5):
    """Display the first *head_size* and last *tail_size* rows of *data*
    as one frame.

    Bug fix: the original used ``data.head(...).append(data.tail(...))``;
    ``DataFrame.append`` was deprecated in pandas 1.4 and removed in 2.0,
    so ``pd.concat`` is used instead (identical result).
    """
    print("The head and tail of the data is given as follows:")
    display(pd.concat([data.head(head_size), data.tail(tail_size)]))
display_head_tail(train, 3, 3)
def remove_duplicates(data):
    """Drop duplicate rows from *data* in place (keeping the first
    occurrence), printing the row count before and after."""
    rows_before = data.shape[0]
    print("Before Removing Duplicate Data: ", rows_before)
    data.drop_duplicates(keep='first', inplace=True)
    rows_after = data.shape[0]
    print("After Removing Duplicate Data: ", rows_after)
remove_duplicates(train)
def handle_missing_values(data, fill_value, fill_types, columns, dataframe_name):
    """Fill NaNs in the given *columns* of *data*, applying every strategy
    named in *fill_types* in a fixed order: "Value_Fill" (constant
    *fill_value*), then "Forward_Fill" (previous row), then "Backward_Fill"
    (next row). Prints the per-column missing counts before and after, and
    returns the mutated frame."""
    print("Missing Values BEFORE REMOVAL in ", dataframe_name)
    display(data.isna().sum())
    for col in columns:
        series = data[col]
        if "Value_Fill" in fill_types:
            series = series.fillna(fill_value)
        if "Forward_Fill" in fill_types:
            series = series.ffill(axis=0)
        if "Backward_Fill" in fill_types:
            series = series.bfill(axis=0)
        data[col] = series
    print("Missing Values AFTER REMOVAL in ", dataframe_name, " data")
    display(data.isnull().sum())
    return data
# Forward-fill missing Number_Weeks_Used values in the training data.
# NOTE(review): fill_value (the median) is computed but unused, since only
# "Forward_Fill" is selected — it would only apply with "Value_Fill".
fill_types = [ "Forward_Fill"]
fill_value = train["Number_Weeks_Used"].median()
train = handle_missing_values(train, fill_value, fill_types, ["Number_Weeks_Used"],"train")
def unique_values(data):
    """For every column of *data*, print the number of unique values, the
    sorted unique values themselves (NaNs last), and the value-count table."""
    for name in data.columns:
        col = data[name]
        print("Number of unique values are: ", str(col.nunique()))
        sorted_uniques = col.sort_values(ascending=True, na_position='last').unique()
        print("Actual Unique Values in " + name + " Column are : " + str(sorted_uniques))
        print("Value Counts :")
        print(col.value_counts())
        print("")
unique_values(train)
def heatmap(df):
    """Draw a 10x10 annotated correlation heatmap of *df* and return the
    seaborn Axes object."""
    plt.figure(figsize=(10, 10))
    correlation_matrix = df.corr()
    ax = sns.heatmap(correlation_matrix, annot=True, cmap='viridis')
    return ax
def cntplt(df, feature):
    """Draw a 10x10 count plot of column *feature* in *df* and return the
    seaborn Axes object."""
    plt.figure(figsize=(10, 10))
    column = df[feature]
    return sns.countplot(data=df, x=column)
# Exploratory plots: correlation heatmap, target-class distribution, and
# season vs. crop-damage bar chart.
heatmap(train)
cntplt(train, 'Crop_Damage')
sns.catplot(x="Crop_Damage", y="Season", hue="Crop_Damage", kind="bar", data=train);
# --- Train/validation split -------------------------------------------------
# Separate the features from the Crop_Damage target and hold out 30%.
X = train.drop(labels=['Crop_Damage'], axis=1)
y = train['Crop_Damage']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Bug fix: the original bare `y_test.value_counts` only referenced the bound
# method without calling it — a no-op outside a notebook. Call and print it.
print(y_test.value_counts())
# Keep columns 1..8 as the model features, dropping column 0.
# NOTE(review): column 0 is presumably the ID column — confirm against the CSV.
X_train_data = X_train.iloc[:, 1:9]
X_test_data = X_test.iloc[:, 1:9]
# Bug fix: the original bare `X_train_data` expression was likewise a no-op
# outside a notebook; print a preview instead.
print(X_train_data.head())
# --- Model training and submission -------------------------------------------
# Fit a LightGBM classifier on the training features.
lgbm = LGBMClassifier()
lgbm_pred = lgbm.fit(X_train_data, y_train)
y_pred = lgbm_pred.predict(X_test_data)
# Fix: accuracy_score's signature is (y_true, y_pred). The accuracy value is
# symmetric, but the conventional order avoids confusion with other metrics.
print(accuracy_score(y_test, y_pred))
# Predict on the hold-out test file using the same 8 feature columns.
# .copy() avoids pandas' SettingWithCopyWarning when columns are added below
# to what would otherwise be a view of `test`.
test2 = test.iloc[:, 1:9].copy()
test_pred = lgbm_pred.predict(test2)
test2['Crop_Damage'] = test_pred
test2['ID'] = test['ID']
# Write the submission file. NOTE: to_csv returns None, so `output` is None —
# the binding is kept only because later code references the name.
output = pd.DataFrame(data={"ID": test2["ID"], "Crop_Damage": test2["Crop_Damage"]}).to_csv("Sol.csv", index=False)
# --- Download results ---------------------------------------------------------
from google.colab import files
files.download('Sol.csv')
# (The original bare `output` expression was a no-op: to_csv returned None.)
# Bug fix: the original built {"ID": ["ID"]} — a one-element literal list that
# raises ValueError whenever y_pred holds more than one prediction. Use the
# IDs of the hold-out rows, which align positionally with y_pred.
output = pd.DataFrame(data={"ID": X_test["ID"].to_numpy(), "Crop_Damage": y_pred}).to_csv("Sample.csv", index=False)
files.download('Sample.csv')