- UCI Machine Learning diabetes dataset used for prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the diabetes dataset from diabetes.csv and print its (rows, columns) shape.
diabetes=pd.read_csv("diabetes.csv")
print(diabetes.shape)
(768, 9)
# Preview the first rows of the dataset.
diabetes.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
- Check the class balance of the target, which is reasonably good (500 rows with Outcome 0 vs. 268 with Outcome 1)
# Count how many rows fall into each Outcome class.
diabetes["Outcome"].value_counts()
0 500
1 268
Name: Outcome, dtype: int64
- About 35% of the people in this dataset (268 of 768) have diabetes
import seaborn as sns

# Plot the number of rows in each Outcome class.
# The column is passed explicitly as `x`: newer seaborn releases treat the
# first positional argument as `data`, so a bare positional Series is no
# longer interpreted as the variable to count.
sns.countplot(x=diabetes['Outcome'], label="Count")
<matplotlib.axes._subplots.AxesSubplot at 0x2db63f3d828>
# Count the missing values in each column of the dataset.
diabetes.isna().sum()
Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
BMI 0
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64
# Show the data type of each column.
diabetes.dtypes
Pregnancies int64
Glucose int64
BloodPressure int64
SkinThickness int64
Insulin int64
BMI float64
DiabetesPedigreeFunction float64
Age int64
Outcome int64
dtype: object
# List the column names as a plain Python list.
list(diabetes.columns)
['Pregnancies',
'Glucose',
'BloodPressure',
'SkinThickness',
'Insulin',
'BMI',
'DiabetesPedigreeFunction',
'Age',
'Outcome']
- Check for outliers and whether the distributions look Gaussian
# Histogram of every feature column (everything except the Outcome target),
# used to look for outliers and to judge the shape of each distribution.
# The trailing semicolon suppresses the notebook's echo of the return value.
diabetes.drop(["Outcome"], axis=1).hist(
    figsize=(16, 10), bins=50, xlabelsize=8, ylabelsize=8
);
- More than 11 pregnancies is unusually high, so we treat it as an outlier
- Body mass index is the ratio of weight to height squared; a BMI below 12 is outside the adult range, so we treat it as an outlier
- A blood pressure of 40 or lower is critically low, so we treat it as an outlier
- A glucose level of 40 or lower is critically low, so we treat it as an outlier
- A skin thickness of 60 or higher is unusually high, so we treat it as an outlier
Remove all outliers
# Keep only the rows that pass every outlier bound, applying the five
# conditions together as one combined boolean mask.
diabetes = diabetes[
    (diabetes['Pregnancies'] <= 11)
    & (diabetes['BMI'] >= 12)
    & (diabetes['BloodPressure'] > 40)
    & (diabetes['Glucose'] > 40)
    & (diabetes['SkinThickness'] < 60)
]
# Re-plot the histogram of every feature column (everything except the
# Outcome target) on the filtered data.
# The trailing semicolon suppresses the notebook's echo of the return value.
diabetes.drop(["Outcome"], axis=1).hist(
    figsize=(16, 10), bins=50, xlabelsize=8, ylabelsize=8
);
- After removing the outliers, the distributions look closer to normal
- We are going to use correlation to find relationships between the independent variables
Assumptions for the Pearson correlation coefficient (PCC)
- The data should be normalized, so we apply scikit-learn's Normalizer
- The relationship should be linear; you can check that some features are linearly related
- The variables should be normally distributed
- It applies only to continuous variables
from sklearn.preprocessing import Normalizer

# Normalize the eight feature columns (everything except Outcome).
# NOTE(review): Normalizer rescales each row (sample) to unit norm rather than
# scaling each column; confirm this is the normalization that is intended.
normalized_application = Normalizer().fit_transform(
    diabetes.drop(["Outcome"], axis=1)
)

# Wrap the normalized array in a DataFrame that carries the feature names.
normal_df = pd.DataFrame(
    normalized_application,
    columns=[
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ],
)

# Pairwise correlation of the normalized features, drawn as an annotated heatmap.
cor = normal_df.corr()
plt.figure(figsize=(8, 8))
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
sns.heatmap(cor, vmax=1, square=True, annot=True, cmap=flatui)
plt.title('Correlation between different fearures')
Text(0.5,1,'Correlation between different fearures')
- Pregnancies and Age are correlated, which makes sense
- Glucose is correlated with BMI and Age, which makes sense
- Blood pressure is related to Glucose, BMI and Age, which gives some insight into diabetes
Based on this interpretation, we are going to retain all independent variables for the model
colnames=['Pregnancies',
'Glucose',
'BloodPressure',
'SkinThickness',
'Insulin',
'BMI',
'DiabetesPedigreeFunction',
'Age']
# Pairwise scatter plots of the normalized features, used to eyeball which
# feature pairs look linearly related. The plot is drawn from the feature
# values in normal_df: pairplot expects one row per observation, whereas the
# correlation matrix `cor` is only an 8x8 summary table.
# NOTE(review): newer seaborn versions name the `size` parameter `height`;
# switch once the installed seaborn version is confirmed.
sns.pairplot(normal_df[colnames],size=1.5,x_vars=colnames,y_vars=colnames)
plt.show()
# Count the missing values in each column of the dataset.
diabetes.isna().sum()
Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
BMI 0
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64
# Count the missing values in each column of the normalized feature frame.
normal_df.isna().sum()
Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
BMI 0
DiabetesPedigreeFunction 0
Age 0
dtype: int64
# Print the shapes of the dataset and of the normalized frame, one per line.
print(diabetes.shape, normal_df.shape, sep="\n")
(694, 9)
(694, 8)
# Give both frames a fresh 0..n-1 index so they line up row for row
# (the outlier filtering left gaps in the dataset's index).
diabetes = diabetes.reset_index(drop=True)
normal_df = normal_df.reset_index(drop=True)
# Attach the target column by position; .values bypasses index alignment.
normal_df['Outcome'] = diabetes.Outcome.values
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation. A fixed random_state makes the
# split, and therefore the scores computed from it, reproducible across runs.
df_train, df_val = train_test_split(normal_df, test_size=0.30, random_state=42)
print(df_train.shape)
print(df_val.shape)
(485, 9)
(209, 9)
- With a better algorithm we could achieve higher accuracy
from sklearn.linear_model import LogisticRegression

# Every column except the Outcome target is used as a model input.
features = normal_df.columns.drop("Outcome")

# Fit a logistic regression on the training split.
logreg = LogisticRegression()
logreg.fit(df_train[features], df_train['Outcome'])

# Report mean accuracy on the training and validation splits.
print("Training set score: %.3f" % logreg.score(df_train[features], df_train['Outcome']))
print("Validation set score: %.3f" % logreg.score(df_val[features], df_val['Outcome']))
Training set score: 0.682
Validation set score: 0.622
# Load the data to predict on from testdiabetes.csv and print its shape.
testdiabetes=pd.read_csv("testdiabetes.csv")
print(testdiabetes.shape)
(1, 8)
# Normalize the test rows with the same kind of transformer used for the
# training features, then label the columns with the feature names.
# NOTE(review): this assumes the CSV columns are already in this order; verify.
normalized = Normalizer().fit_transform(testdiabetes)
normal_test = pd.DataFrame(
    normalized,
    columns=[
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ],
)

# Predict the Outcome class for the test rows with the fitted model.
prediction = logreg.predict(normal_test)
print(prediction)
[0]