import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
df = pd.read_csv("C:/Users/nvsiv/OneDrive/Desktop/Assignment Heart disease UCI/heart_disease_uci.csv")
df
id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
2 | 3 | 67 | Male | Cleveland | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | Cleveland | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | Cleveland | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
915 | 916 | 54 | Female | VA Long Beach | asymptomatic | 127.0 | 333.0 | True | st-t abnormality | 154.0 | False | 0.0 | NaN | NaN | NaN | 1 |
916 | 917 | 62 | Male | VA Long Beach | typical angina | NaN | 139.0 | False | st-t abnormality | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
917 | 918 | 55 | Male | VA Long Beach | asymptomatic | 122.0 | 223.0 | True | st-t abnormality | 100.0 | False | 0.0 | NaN | NaN | fixed defect | 2 |
918 | 919 | 58 | Male | VA Long Beach | asymptomatic | NaN | 385.0 | True | lv hypertrophy | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
919 | 920 | 62 | Male | VA Long Beach | atypical angina | 120.0 | 254.0 | False | lv hypertrophy | 93.0 | True | 0.0 | NaN | NaN | NaN | 1 |
920 rows × 16 columns
df = df.drop(['dataset'],axis = 1)
df
id | age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | Male | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
2 | 3 | 67 | Male | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
915 | 916 | 54 | Female | asymptomatic | 127.0 | 333.0 | True | st-t abnormality | 154.0 | False | 0.0 | NaN | NaN | NaN | 1 |
916 | 917 | 62 | Male | typical angina | NaN | 139.0 | False | st-t abnormality | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
917 | 918 | 55 | Male | asymptomatic | 122.0 | 223.0 | True | st-t abnormality | 100.0 | False | 0.0 | NaN | NaN | fixed defect | 2 |
918 | 919 | 58 | Male | asymptomatic | NaN | 385.0 | True | lv hypertrophy | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
919 | 920 | 62 | Male | atypical angina | 120.0 | 254.0 | False | lv hypertrophy | 93.0 | True | 0.0 | NaN | NaN | NaN | 1 |
920 rows × 15 columns
df.head()
id | age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | Male | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
2 | 3 | 67 | Male | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
df.tail()
id | age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
915 | 916 | 54 | Female | asymptomatic | 127.0 | 333.0 | True | st-t abnormality | 154.0 | False | 0.0 | NaN | NaN | NaN | 1 |
916 | 917 | 62 | Male | typical angina | NaN | 139.0 | False | st-t abnormality | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
917 | 918 | 55 | Male | asymptomatic | 122.0 | 223.0 | True | st-t abnormality | 100.0 | False | 0.0 | NaN | NaN | fixed defect | 2 |
918 | 919 | 58 | Male | asymptomatic | NaN | 385.0 | True | lv hypertrophy | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
919 | 920 | 62 | Male | atypical angina | 120.0 | 254.0 | False | lv hypertrophy | 93.0 | True | 0.0 | NaN | NaN | NaN | 1 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 920 entries, 0 to 919 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 920 non-null int64 1 age 920 non-null int64 2 sex 920 non-null object 3 cp 920 non-null object 4 trestbps 861 non-null float64 5 chol 890 non-null float64 6 fbs 830 non-null object 7 restecg 918 non-null object 8 thalch 865 non-null float64 9 exang 865 non-null object 10 oldpeak 858 non-null float64 11 slope 611 non-null object 12 ca 309 non-null float64 13 thal 434 non-null object 14 num 920 non-null int64 dtypes: float64(5), int64(3), object(7) memory usage: 107.9+ KB
df.describe(include="all")
id | age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 920.000000 | 920.000000 | 920 | 920 | 861.000000 | 890.000000 | 830 | 918 | 865.000000 | 865 | 858.000000 | 611 | 309.000000 | 434 | 920.000000 |
unique | NaN | NaN | 2 | 4 | NaN | NaN | 2 | 3 | NaN | 2 | NaN | 3 | NaN | 3 | NaN |
top | NaN | NaN | Male | asymptomatic | NaN | NaN | False | normal | NaN | False | NaN | flat | NaN | normal | NaN |
freq | NaN | NaN | 726 | 496 | NaN | NaN | 692 | 551 | NaN | 528 | NaN | 345 | NaN | 196 | NaN |
mean | 460.500000 | 53.510870 | NaN | NaN | 132.132404 | 199.130337 | NaN | NaN | 137.545665 | NaN | 0.878788 | NaN | 0.676375 | NaN | 0.995652 |
std | 265.725422 | 9.424685 | NaN | NaN | 19.066070 | 110.780810 | NaN | NaN | 25.926276 | NaN | 1.091226 | NaN | 0.935653 | NaN | 1.142693 |
min | 1.000000 | 28.000000 | NaN | NaN | 0.000000 | 0.000000 | NaN | NaN | 60.000000 | NaN | -2.600000 | NaN | 0.000000 | NaN | 0.000000 |
25% | 230.750000 | 47.000000 | NaN | NaN | 120.000000 | 175.000000 | NaN | NaN | 120.000000 | NaN | 0.000000 | NaN | 0.000000 | NaN | 0.000000 |
50% | 460.500000 | 54.000000 | NaN | NaN | 130.000000 | 223.000000 | NaN | NaN | 140.000000 | NaN | 0.500000 | NaN | 0.000000 | NaN | 1.000000 |
75% | 690.250000 | 60.000000 | NaN | NaN | 140.000000 | 268.000000 | NaN | NaN | 157.000000 | NaN | 1.500000 | NaN | 1.000000 | NaN | 2.000000 |
max | 920.000000 | 77.000000 | NaN | NaN | 200.000000 | 603.000000 | NaN | NaN | 202.000000 | NaN | 6.200000 | NaN | 3.000000 | NaN | 4.000000 |
df.isnull().sum()
id 0 age 0 sex 0 cp 0 trestbps 59 chol 30 fbs 90 restecg 2 thalch 55 exang 55 oldpeak 62 slope 309 ca 611 thal 486 num 0 dtype: int64
df['cp'].value_counts()
asymptomatic 496 non-anginal 204 atypical angina 174 typical angina 46 Name: cp, dtype: int64
encoders1= {'True':1, 'False':0}
df['fbs'] = df['fbs'].replace(encoders1)
encoders2 = {'asymptomatic':0, 'non-anginal':1, 'atypical angina': 2, 'typical angina': 3}
df['cp'] = df['cp'].replace(encoders2)
df['exang'] = df['exang'].replace(encoders1)
slope_1 = {'flat':0, 'upsloping':1, 'downsloping':2}
df['slope'] = df['slope'].replace(slope_1)
ca_1 = {'0.935653':4, '0.000000':0, '1.000000':1, '2.000000':2, '3.000000':3}
df['ca'] = df['ca'].replace(ca_1)
restecg_1 = {'normal':0, 'lv hypertrophy':1, 'st-t abnormality': 2}
df['restecg'] = df['restecg'].replace(restecg_1)
thal_1 = {'normal':0, 'reversable defect':1, 'fixed defect': 2}
df['thal'] = df['thal'].replace(thal_1)
sex_1 = {'Male':1, 'Female':0}
df['sex'] = df['sex'].replace(sex_1)
df['num'].replace([0,1,2,3,4],['absent','present','present','present','present'], inplace = True)
df['num'].value_counts(normalize = True)
present 0.553261 absent 0.446739 Name: num, dtype: float64
df['num'].replace(['present','absent'],[1,0], inplace = True)
df['num'].value_counts(normalize = True)
1 0.553261 0 0.446739 Name: num, dtype: float64
# numerical
df['trestbps'].fillna(df['trestbps'].mean(),inplace = True)
df['chol'].fillna(df['chol'].mean(),inplace = True)
df['thalch'].fillna(df['thalch'].mean(),inplace = True)
df['oldpeak'].fillna(df['oldpeak'].mean(),inplace = True)
df['ca'].fillna(df['ca'].std(),inplace = True)
# categorical
df['exang'].fillna(df['exang'].mean(),inplace = True)
df['fbs'].fillna(df['fbs'].mean(),inplace = True)
df['slope'].fillna(df['slope'].mean(),inplace = True)
df['restecg'].fillna(df['restecg'].mean(),inplace = True)
df['thal'].fillna(df['thal'].mean(),inplace = True)
df.isnull().sum()
id 0 age 0 sex 0 cp 0 trestbps 0 chol 0 fbs 0 restecg 0 thalch 0 exang 0 oldpeak 0 slope 0 ca 0 thal 0 num 0 dtype: int64
df.describe()
id | age | sex | cp | trestbps | chol | restecg | thalch | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 |
mean | 460.500000 | 53.510870 | 0.789130 | 0.750000 | 132.132404 | 199.130337 | 0.594771 | 137.545665 | 0.878788 | 0.538462 | 0.848570 | 0.654378 | 0.553261 |
std | 265.725422 | 9.424685 | 0.408148 | 0.930969 | 18.443895 | 108.957634 | 0.793921 | 25.138494 | 1.053774 | 0.549850 | 0.555351 | 0.454881 | 0.497426 |
min | 1.000000 | 28.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 60.000000 | -2.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 230.750000 | 47.000000 | 1.000000 | 0.000000 | 120.000000 | 177.750000 | 0.000000 | 120.000000 | 0.000000 | 0.000000 | 0.935653 | 0.654378 | 0.000000 |
50% | 460.500000 | 54.000000 | 1.000000 | 0.000000 | 130.000000 | 221.000000 | 0.000000 | 138.000000 | 0.800000 | 0.538462 | 0.935653 | 0.654378 | 1.000000 |
75% | 690.250000 | 60.000000 | 1.000000 | 1.000000 | 140.000000 | 267.000000 | 1.000000 | 156.000000 | 1.500000 | 1.000000 | 0.935653 | 1.000000 | 1.000000 |
max | 920.000000 | 77.000000 | 1.000000 | 3.000000 | 200.000000 | 603.000000 | 2.000000 | 202.000000 | 6.200000 | 2.000000 | 3.000000 | 2.000000 | 1.000000 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 920 entries, 0 to 919 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 920 non-null int64 1 age 920 non-null int64 2 sex 920 non-null int64 3 cp 920 non-null int64 4 trestbps 920 non-null float64 5 chol 920 non-null float64 6 fbs 920 non-null object 7 restecg 920 non-null float64 8 thalch 920 non-null float64 9 exang 920 non-null object 10 oldpeak 920 non-null float64 11 slope 920 non-null float64 12 ca 920 non-null float64 13 thal 920 non-null float64 14 num 920 non-null int64 dtypes: float64(8), int64(5), object(2) memory usage: 107.9+ KB
Analysis Part:
sns.displot(df['sex'])
<seaborn.axisgrid.FacetGrid at 0x1fa8760be50>
sns.displot(df['cp'])
<seaborn.axisgrid.FacetGrid at 0x1fa839b3850>
sns.displot(df['age'])
<seaborn.axisgrid.FacetGrid at 0x1fa874eca60>
sns.displot(df['thal'])
<seaborn.axisgrid.FacetGrid at 0x1fa874ccb80>
sns.displot(df['restecg'])
<seaborn.axisgrid.FacetGrid at 0x1fa839cdee0>
sns.displot(df['slope'])
<seaborn.axisgrid.FacetGrid at 0x1fa875be0d0>
df.hist(column = "num", by = "sex", bins = 10)
array([<AxesSubplot:title={'center':'0'}>, <AxesSubplot:title={'center':'1'}>], dtype=object)
Heat Map
plt.figure(figsize=(15,15))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
<AxesSubplot:>
Building a Model
x = df.iloc[:,0:14]
y = df.iloc[:,-1]
x.head()
id | age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | 3 | 145.0 | 233.0 | True | 1.0 | 150.0 | False | 2.3 | 2.0 | 0.0 | 2.0 |
1 | 2 | 67 | 1 | 0 | 160.0 | 286.0 | False | 1.0 | 108.0 | True | 1.5 | 0.0 | 3.0 | 0.0 |
2 | 3 | 67 | 1 | 0 | 120.0 | 229.0 | False | 1.0 | 129.0 | True | 2.6 | 0.0 | 2.0 | 1.0 |
3 | 4 | 37 | 1 | 1 | 130.0 | 250.0 | False | 0.0 | 187.0 | False | 3.5 | 2.0 | 0.0 | 0.0 |
4 | 5 | 41 | 0 | 2 | 130.0 | 204.0 | False | 1.0 | 172.0 | False | 1.4 | 1.0 | 0.0 | 0.0 |
y.head()
0 0 1 1 2 1 3 0 4 0 Name: num, dtype: int64
minmax=preprocessing.MinMaxScaler(feature_range=(0,1))
minmax.fit(x).transform(x)
array([[0. , 0.71428571, 1. , ..., 1. , 0. , 1. ], [0.00108814, 0.79591837, 1. , ..., 0. , 1. , 0. ], [0.00217628, 0.79591837, 1. , ..., 0. , 0.66666667, 0.5 ], ..., [0.99782372, 0.55102041, 1. , ..., 0.26923077, 0.31188434, 1. ], [0.99891186, 0.6122449 , 1. , ..., 0.26923077, 0.31188434, 0.32718894], [1. , 0.69387755, 1. , ..., 0.26923077, 0.31188434, 0.32718894]])
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8, random_state = 42, shuffle = True)
x_train.head()
id | age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
880 | 881 | 62 | 1 | 0 | 132.132404 | 170.000000 | False | 2.0 | 120.000000 | True | 3.000000 | 0.538462 | 0.935653 | 0.654378 |
457 | 458 | 54 | 1 | 1 | 150.000000 | 199.130337 | False | 0.0 | 122.000000 | False | 0.000000 | 0.538462 | 0.935653 | 0.654378 |
797 | 798 | 51 | 1 | 1 | 132.132404 | 339.000000 | False | 0.0 | 137.545665 | 0.389595 | 0.878788 | 0.538462 | 0.935653 | 0.654378 |
25 | 26 | 50 | 0 | 1 | 120.000000 | 219.000000 | False | 0.0 | 158.000000 | False | 1.600000 | 0.000000 | 0.000000 | 0.000000 |
84 | 85 | 52 | 1 | 2 | 120.000000 | 325.000000 | False | 0.0 | 172.000000 | False | 0.200000 | 1.000000 | 0.000000 | 0.000000 |
y_train.head()
880 1 457 0 797 1 25 0 84 0 Name: num, dtype: int64
KNN Classifier
#Fitting Classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(x_train,y_train)
KNeighborsClassifier()
#Predict on test data
y_pred=clf.predict(x_test)
print("Predicted Heart disease : ")
print(y_pred)
Predicted Heart disease : [0 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1]
print("Actual Heart disease : ")
print(y_test.values)
Actual Heart disease : [0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix
print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))
Accuracy score: 80.978261
print(confusion_matrix(y_test, y_pred))
[[63 12] [23 86]]
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_pred, y_test)
roc_auc = auc(fpr, tpr)
#plt.figure()
plt.plot(fpr, tpr, color='darkorange',
label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))
ROC score : 81.449541
Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rmf = RandomForestClassifier(max_depth=3, random_state=42)
rmf_clf = rmf.fit(x_train, y_train)
#Predict on test data
y_pred=rmf.predict(x_test)
print("Predicted Heart disease : ")
print(y_pred)
Predicted Heart disease : [0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]
print("Actual Heart disease : ")
print(y_test.values)
Actual Heart disease : [0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]
print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))
print("Recall score : %f" %(recall_score(y_test, y_pred) * 100))
print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))
print(confusion_matrix(y_test, y_pred))
Accuracy score: 88.586957 Recall score : 90.825688 ROC score : 88.079511 [[64 11] [10 99]]
Navies Bayes Classification
from sklearn.naive_bayes import GaussianNB
nb_classifier =GaussianNB()
nb_classifier.fit(x_train, y_train)
GaussianNB()
y_pred=nb_classifier.predict(x_test)
print("Predicted Heart disease : ")
print(y_pred)
Predicted Heart disease : [0 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 0 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1]
print("Actual Heart disease : ")
print(y_test.values)
Actual Heart disease : [0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]
print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))
print("Recall score : %f" %(recall_score(y_test, y_pred) * 100))
print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))
print(confusion_matrix(y_test, y_pred))
Accuracy score: 81.521739 Recall score : 78.899083 ROC score : 82.116208 [[64 11] [23 86]]
Pipelining
pipe_knn = Pipeline([('scl', MinMaxScaler()),
('pca', PCA(n_components=2)),
('clf', KNeighborsClassifier())])
pipe_rf = Pipeline([('scl', MinMaxScaler()),
('pca', PCA(n_components=3)),
('clf', RandomForestClassifier(random_state=42))])
pipe_nb = Pipeline([('scl', MinMaxScaler()),
('pca', PCA(n_components=2)),
('clf', GaussianNB())])
pipelines = [pipe_knn, pipe_rf, pipe_nb]
pipelines[1]
Pipeline(steps=[('scl', MinMaxScaler()), ('pca', PCA(n_components=3)), ('clf', RandomForestClassifier(random_state=42))])
pipe_dict = {0: 'kNN Classifier', 1: 'Random Forest', 2: 'Navies Bayes'}
for pipe in pipelines:
pipe.fit(x_train, y_train)
for idx, val in enumerate(pipelines):
print('%s pipeline test accuracy: %.2f' %
(pipe_dict[idx], val.score(x_test, y_test)))
kNN Classifier pipeline test accuracy: 0.82 Random Forest pipeline test accuracy: 0.85 Navies Bayes pipeline test accuracy: 0.80
best_acc = 0.0
best_clf = 0
best_pipe = ''
for idx, val in enumerate(pipelines):
if val.score(x_test, y_test) > best_acc:
best_acc = val.score(x_test, y_test)
best_pipe = val
best_clf = idx
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])
Classifier with best accuracy: Random Forest