This is a multivariate dataset, meaning it involves several separate mathematical or statistical variables, and so lends itself to multivariate numerical data analysis.
It is composed of 14 attributes: age, sex, chest pain type, resting blood pressure, serum cholesterol, fasting blood sugar, resting electrocardiographic results, maximum heart rate achieved, exercise-induced angina, oldpeak (ST depression induced by exercise relative to rest), the slope of the peak exercise ST segment, number of major vessels, thalassemia, and the diagnosis of heart disease (the target).
The full database includes 76 attributes, but all published studies use a subset of 14 of them.
The Cleveland database is the only one that has been widely used by ML researchers to date. One major task on this dataset is to predict, from a patient's attributes, whether that person has heart disease; the other is the exploratory task of diagnosing and drawing insights from the data that deepen understanding of the problem.
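For reference, the short column names in the CSV used below map onto these attributes roughly as follows (descriptions paraphrased from the standard UCI documentation; the mapping is inferred from the dataframe shown later):
# Column-name glossary for heart_disease_uci.csv (inferred, for reference only).
attribute_names = {
    'age': 'age in years',
    'sex': 'sex',
    'cp': 'chest pain type',
    'trestbps': 'resting blood pressure (mm Hg)',
    'chol': 'serum cholesterol (mg/dl)',
    'fbs': 'fasting blood sugar > 120 mg/dl',
    'restecg': 'resting electrocardiographic results',
    'thalch': 'maximum heart rate achieved',
    'exang': 'exercise-induced angina',
    'oldpeak': 'ST depression induced by exercise relative to rest',
    'slope': 'slope of the peak exercise ST segment',
    'ca': 'number of major vessels colored by fluoroscopy',
    'thal': 'thalassemia',
    'num': 'diagnosis of heart disease (target)',
}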
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
path = 'C:\\Users\\mahit\\OneDrive\\Desktop\\DSPP\\ML & DL\\Assignments\\JNTUH ML DL assignment 3\\'
df = pd.read_csv(path+'heart_disease_uci.csv')
df
 | id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
2 | 3 | 67 | Male | Cleveland | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | Cleveland | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | Cleveland | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
915 | 916 | 54 | Female | VA Long Beach | asymptomatic | 127.0 | 333.0 | True | st-t abnormality | 154.0 | False | 0.0 | NaN | NaN | NaN | 1 |
916 | 917 | 62 | Male | VA Long Beach | typical angina | NaN | 139.0 | False | st-t abnormality | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
917 | 918 | 55 | Male | VA Long Beach | asymptomatic | 122.0 | 223.0 | True | st-t abnormality | 100.0 | False | 0.0 | NaN | NaN | fixed defect | 2 |
918 | 919 | 58 | Male | VA Long Beach | asymptomatic | NaN | 385.0 | True | lv hypertrophy | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
919 | 920 | 62 | Male | VA Long Beach | atypical angina | 120.0 | 254.0 | False | lv hypertrophy | 93.0 | True | 0.0 | NaN | NaN | NaN | 1 |
920 rows × 16 columns
df.head()
 | id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
2 | 3 | 67 | Male | Cleveland | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | Cleveland | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | Cleveland | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
df.tail()
 | id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
915 | 916 | 54 | Female | VA Long Beach | asymptomatic | 127.0 | 333.0 | True | st-t abnormality | 154.0 | False | 0.0 | NaN | NaN | NaN | 1 |
916 | 917 | 62 | Male | VA Long Beach | typical angina | NaN | 139.0 | False | st-t abnormality | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
917 | 918 | 55 | Male | VA Long Beach | asymptomatic | 122.0 | 223.0 | True | st-t abnormality | 100.0 | False | 0.0 | NaN | NaN | fixed defect | 2 |
918 | 919 | 58 | Male | VA Long Beach | asymptomatic | NaN | 385.0 | True | lv hypertrophy | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
919 | 920 | 62 | Male | VA Long Beach | atypical angina | 120.0 | 254.0 | False | lv hypertrophy | 93.0 | True | 0.0 | NaN | NaN | NaN | 1 |
df = df.drop(['dataset'],axis = 1)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   cp        920 non-null    object 
 4   trestbps  861 non-null    float64
 5   chol      890 non-null    float64
 6   fbs       830 non-null    object 
 7   restecg   918 non-null    object 
 8   thalch    865 non-null    float64
 9   exang     865 non-null    object 
 10  oldpeak   858 non-null    float64
 11  slope     611 non-null    object 
 12  ca        309 non-null    float64
 13  thal      434 non-null    object 
 14  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(7)
memory usage: 107.9+ KB
df.isnull().sum()
id            0
age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64
df.describe()
 | id | age | trestbps | chol | thalch | oldpeak | ca | num |
---|---|---|---|---|---|---|---|---|
count | 920.000000 | 920.000000 | 861.000000 | 890.000000 | 865.000000 | 858.000000 | 309.000000 | 920.000000 |
mean | 460.500000 | 53.510870 | 132.132404 | 199.130337 | 137.545665 | 0.878788 | 0.676375 | 0.995652 |
std | 265.725422 | 9.424685 | 19.066070 | 110.780810 | 25.926276 | 1.091226 | 0.935653 | 1.142693 |
min | 1.000000 | 28.000000 | 0.000000 | 0.000000 | 60.000000 | -2.600000 | 0.000000 | 0.000000 |
25% | 230.750000 | 47.000000 | 120.000000 | 175.000000 | 120.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 460.500000 | 54.000000 | 130.000000 | 223.000000 | 140.000000 | 0.500000 | 0.000000 | 1.000000 |
75% | 690.250000 | 60.000000 | 140.000000 | 268.000000 | 157.000000 | 1.500000 | 1.000000 | 2.000000 |
max | 920.000000 | 77.000000 | 200.000000 | 603.000000 | 202.000000 | 6.200000 | 3.000000 | 4.000000 |
df['cp'].value_counts()
asymptomatic       496
non-anginal        204
atypical angina    174
typical angina      46
Name: cp, dtype: int64
# Encode the remaining categorical / boolean columns as integers.
fbs_1 = {True:1, False:0, 'True':1, 'False':0}   # keys cover both boolean and string representations
df['fbs'] = df['fbs'].replace(fbs_1)
cp_1 = {'asymptomatic':0, 'non-anginal':1, 'atypical angina': 2, 'typical angina': 3}
df['cp'] = df['cp'].replace(cp_1)
exang_1 = {True:1, False:0, 'True':1, 'False':0}
df['exang'] = df['exang'].replace(exang_1)
slope_1 = {'flat':0, 'upsloping':1, 'downsloping':2}
df['slope'] = df['slope'].replace(slope_1)
# ca (number of major vessels) is already numeric (0-3), so it needs no remapping.
restecg_1 = {'normal':0, 'lv hypertrophy':1, 'st-t abnormality': 2}
df['restecg'] = df['restecg'].replace(restecg_1)
thal_1 = {'normal':0, 'reversable defect':1, 'fixed defect': 2}
df['thal'] = df['thal'].replace(thal_1)
sex_1 = {'Male':1, 'Female':0}
df['sex'] = df['sex'].replace(sex_1)
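All of these label encodings could equally be applied in one pass with a nested replace dictionary; a compact sketch of the equivalent mapping:
# Sketch: the same encodings expressed as a single nested dict (equivalent to the replaces above).
encodings = {
    'sex':     {'Male': 1, 'Female': 0},
    'cp':      {'asymptomatic': 0, 'non-anginal': 1, 'atypical angina': 2, 'typical angina': 3},
    'fbs':     {True: 1, False: 0},
    'exang':   {True: 1, False: 0},
    'slope':   {'flat': 0, 'upsloping': 1, 'downsloping': 2},
    'restecg': {'normal': 0, 'lv hypertrophy': 1, 'st-t abnormality': 2},
    'thal':    {'normal': 0, 'reversable defect': 1, 'fixed defect': 2},
}
# df = df.replace(encodings)   # would apply all of the above in one call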
df['num'].replace([0,1,2,3,4],['absent','present','present','present','present'], inplace = True)
df['num'].value_counts(normalize = True)
present    0.553261
absent     0.446739
Name: num, dtype: float64
df['num'].replace(['present','absent'],[1,0], inplace = True)
df['num'].value_counts(normalize = True)
1    0.553261
0    0.446739
Name: num, dtype: float64
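The same binarization can be written in one step, without the intermediate string labels (an equivalent alternative, sketched for reference):
# Sketch: collapse the 0-4 severity scale straight into a binary target (0 = absent, 1-4 = present).
df['num'] = (df['num'] > 0).astype(int)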
# Numerical
df['trestbps'].fillna(df['trestbps'].mean(),inplace = True)
df['chol'].fillna(df['chol'].mean(),inplace = True)
df['thalch'].fillna(df['thalch'].mean(),inplace = True)
df['oldpeak'].fillna(df['oldpeak'].mean(),inplace = True)
df['ca'].fillna(df['ca'].std(),inplace = True)   # note: ca is filled with its standard deviation (~0.94) rather than the mean
# Categorical (mean-filling the encoded categories yields fractional codes, e.g. ~0.54 for slope)
df['exang'].fillna(df['exang'].mean(),inplace = True)
df['fbs'].fillna(df['fbs'].mean(),inplace = True)
df['slope'].fillna(df['slope'].mean(),inplace = True)
df['restecg'].fillna(df['restecg'].mean(),inplace = True)
df['thal'].fillna(df['thal'].mean(),inplace = True)
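Mean-filling the encoded categorical columns leaves fractional category codes (visible in df.describe() below). A more conventional alternative, assuming the most frequent value is an acceptable fill, is mode imputation (a sketch, not the approach used above):
# Sketch: mode (most frequent value) imputation for the encoded categorical columns.
for col in ['exang', 'fbs', 'slope', 'restecg', 'thal']:
    df[col] = df[col].fillna(df[col].mode()[0])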
df.isnull().sum()
id          0
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    int64  
 3   cp        920 non-null    int64  
 4   trestbps  920 non-null    float64
 5   chol      920 non-null    float64
 6   fbs       920 non-null    object 
 7   restecg   920 non-null    float64
 8   thalch    920 non-null    float64
 9   exang     920 non-null    object 
 10  oldpeak   920 non-null    float64
 11  slope     920 non-null    float64
 12  ca        920 non-null    float64
 13  thal      920 non-null    float64
 14  num       920 non-null    int64  
dtypes: float64(8), int64(5), object(2)
memory usage: 107.9+ KB
df.describe()
 | id | age | sex | cp | trestbps | chol | restecg | thalch | oldpeak | slope | ca | thal | num |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 | 920.000000 |
mean | 460.500000 | 53.510870 | 0.789130 | 0.750000 | 132.132404 | 199.130337 | 0.594771 | 137.545665 | 0.878788 | 0.538462 | 0.848570 | 0.654378 | 0.553261 |
std | 265.725422 | 9.424685 | 0.408148 | 0.930969 | 18.443895 | 108.957634 | 0.793921 | 25.138494 | 1.053774 | 0.549850 | 0.555351 | 0.454881 | 0.497426 |
min | 1.000000 | 28.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 60.000000 | -2.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 230.750000 | 47.000000 | 1.000000 | 0.000000 | 120.000000 | 177.750000 | 0.000000 | 120.000000 | 0.000000 | 0.000000 | 0.935653 | 0.654378 | 0.000000 |
50% | 460.500000 | 54.000000 | 1.000000 | 0.000000 | 130.000000 | 221.000000 | 0.000000 | 138.000000 | 0.800000 | 0.538462 | 0.935653 | 0.654378 | 1.000000 |
75% | 690.250000 | 60.000000 | 1.000000 | 1.000000 | 140.000000 | 267.000000 | 1.000000 | 156.000000 | 1.500000 | 1.000000 | 0.935653 | 1.000000 | 1.000000 |
max | 920.000000 | 77.000000 | 1.000000 | 3.000000 | 200.000000 | 603.000000 | 2.000000 | 202.000000 | 6.200000 | 2.000000 | 3.000000 | 2.000000 | 1.000000 |
sns.displot(df['sex'])
<seaborn.axisgrid.FacetGrid at 0x2062e4e3a60>
sns.displot(df['cp'])
<seaborn.axisgrid.FacetGrid at 0x20639f05580>
sns.displot(df['age'])
<seaborn.axisgrid.FacetGrid at 0x206354b7220>
sns.displot(df['thal'])
<seaborn.axisgrid.FacetGrid at 0x20639f2f850>
sns.displot(df['restecg'])
<seaborn.axisgrid.FacetGrid at 0x2063af85e20>
sns.displot(df['slope'])
<seaborn.axisgrid.FacetGrid at 0x2063b0b6d90>
df.hist(column = "num", by = "sex", bins = 10)
array([<AxesSubplot:title={'center':'0'}>, <AxesSubplot:title={'center':'1'}>], dtype=object)
plt.figure(figsize=(15,15))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
<AxesSubplot:>
x = df.iloc[:,0:14]   # all columns except the target (note: the 'id' row identifier is kept as a feature)
y = df.iloc[:,-1]     # target: num (1 = heart disease present, 0 = absent)
x.head()
 | id | age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | 3 | 145.0 | 233.0 | True | 1.0 | 150.0 | False | 2.3 | 2.0 | 0.0 | 2.0 |
1 | 2 | 67 | 1 | 0 | 160.0 | 286.0 | False | 1.0 | 108.0 | True | 1.5 | 0.0 | 3.0 | 0.0 |
2 | 3 | 67 | 1 | 0 | 120.0 | 229.0 | False | 1.0 | 129.0 | True | 2.6 | 0.0 | 2.0 | 1.0 |
3 | 4 | 37 | 1 | 1 | 130.0 | 250.0 | False | 0.0 | 187.0 | False | 3.5 | 2.0 | 0.0 | 0.0 |
4 | 5 | 41 | 0 | 2 | 130.0 | 204.0 | False | 1.0 | 172.0 | False | 1.4 | 1.0 | 0.0 | 0.0 |
y.head()
0    0
1    1
2    1
3    0
4    0
Name: num, dtype: int64
minmax=preprocessing.MinMaxScaler(feature_range=(0,1))
minmax.fit(x).transform(x)   # note: the scaled array is only displayed here, not assigned; the classifiers below train on unscaled features, and scaling is applied for real in the pipelines at the end
array([[0. , 0.71428571, 1. , ..., 1. , 0. , 1. ], [0.00108814, 0.79591837, 1. , ..., 0. , 1. , 0. ], [0.00217628, 0.79591837, 1. , ..., 0. , 0.66666667, 0.5 ], ..., [0.99782372, 0.55102041, 1. , ..., 0.26923077, 0.31188434, 1. ], [0.99891186, 0.6122449 , 1. , ..., 0.26923077, 0.31188434, 0.32718894], [1. , 0.69387755, 1. , ..., 0.26923077, 0.31188434, 0.32718894]])
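If the scaled features were meant to feed the classifiers directly (rather than only through the pipelines later), the result would need to be kept, e.g. (a sketch; x_scaled is a hypothetical name):
# Sketch: retain the min-max scaled features as a DataFrame for later use.
x_scaled = pd.DataFrame(minmax.fit_transform(x), columns=x.columns, index=x.index)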
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8, random_state = 42, shuffle = True)
x_train.head()
 | id | age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
880 | 881 | 62 | 1 | 0 | 132.132404 | 170.000000 | False | 2.0 | 120.000000 | True | 3.000000 | 0.538462 | 0.935653 | 0.654378 |
457 | 458 | 54 | 1 | 1 | 150.000000 | 199.130337 | False | 0.0 | 122.000000 | False | 0.000000 | 0.538462 | 0.935653 | 0.654378 |
797 | 798 | 51 | 1 | 1 | 132.132404 | 339.000000 | False | 0.0 | 137.545665 | 0.389595 | 0.878788 | 0.538462 | 0.935653 | 0.654378 |
25 | 26 | 50 | 0 | 1 | 120.000000 | 219.000000 | False | 0.0 | 158.000000 | False | 1.600000 | 0.000000 | 0.000000 | 0.000000 |
84 | 85 | 52 | 1 | 2 | 120.000000 | 325.000000 | False | 0.0 | 172.000000 | False | 0.200000 | 1.000000 | 0.000000 | 0.000000 |
y_train.head()
880    1
457    0
797    1
25     0
84     0
Name: num, dtype: int64
#Fitting Classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(x_train,y_train)
KNeighborsClassifier()
#Predict on test data
y_pred=clf.predict(x_test)
print("Predicted Heart disease : ")
print(y_pred)
Predicted Heart disease : [0 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1]
print("Actual Heart disease : ")
print(y_test.values)
Actual Heart disease : [0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix
print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))
Accuracy score: 80.978261
print(confusion_matrix(y_test, y_pred))
[[63 12]
 [23 86]]
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, y_pred)   # roc_curve expects (y_true, y_score)
roc_auc = auc(fpr, tpr)
#plt.figure()
plt.plot(fpr, tpr, color='darkorange',
label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))
ROC score : 81.449541
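The KNN model above uses the default n_neighbors=5; a small cross-validated grid search on the training split shows whether a different neighbourhood size helps (a sketch; the candidate values are an arbitrary choice, not part of the original notebook):
from sklearn.model_selection import GridSearchCV

# Sketch: tune n_neighbors with 5-fold cross-validation on the training data.
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 15]}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_params_, grid.best_score_)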
from sklearn.ensemble import RandomForestClassifier
rmf = RandomForestClassifier(max_depth=3, random_state=42)
rmf_clf = rmf.fit(x_train, y_train)
#Predict on test data
y_pred=rmf.predict(x_test)
print("Predicted Heart disease : ")
print(y_pred)
Predicted Heart disease : [0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]
print("Actual Heart disease : ")
print(y_test.values)
Actual Heart disease : [0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]
print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))
print("Recall score : %f" %(recall_score(y_test, y_pred) * 100))
print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))
print(confusion_matrix(y_test, y_pred))
Accuracy score: 88.586957
Recall score : 90.825688
ROC score : 88.079511
[[64 11]
 [10 99]]
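Since the random forest performs best of the three classifiers, it is worth checking which attributes it leans on. A minimal sketch using the fitted model's feature_importances_:
# Sketch: rank the input features by importance in the fitted random forest.
importances = pd.Series(rmf_clf.feature_importances_, index=x_train.columns)
print(importances.sort_values(ascending=False))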
# 'probas' was not defined in the original cell; a plausible reading, given the 0.5-1.0 x-range,
# is the random forest's predicted probability for the winning class (an assumption).
probas = rmf_clf.predict_proba(x_test).max(axis=1)
plt.figure(dpi=150)
plt.hist(probas, bins=20)
plt.title('Classification Probabilities')
plt.xlabel('Probability')
plt.ylabel('# of Instances')
plt.xlim([0.5, 1.0])
plt.show()
from sklearn.naive_bayes import GaussianNB
nb_classifier =GaussianNB()
nb_classifier.fit(x_train, y_train)
GaussianNB()
y_pred=nb_classifier.predict(x_test)
print("Predicted Heart disease : ")
print(y_pred)
Predicted Heart disease : [0 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 0 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1]
print("Actual Heart disease : ")
print(y_test.values)
Actual Heart disease : [0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]
print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))
print("Recall score : %f" %(recall_score(y_test, y_pred) * 100))
print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))
print(confusion_matrix(y_test, y_pred))
Accuracy score: 81.521739
Recall score : 78.899083
ROC score : 82.116208
[[64 11]
 [23 86]]
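Accuracy and recall alone hide the per-class trade-offs; sklearn's classification_report gives precision, recall and F1 for both classes (a sketch applied to the naive Bayes predictions above):
from sklearn.metrics import classification_report

# Sketch: per-class precision/recall/F1 for the naive Bayes predictions.
print(classification_report(y_test, y_pred, target_names=['absent', 'present']))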
pipe_knn = Pipeline([('scl', MinMaxScaler()),
('pca', PCA(n_components=2)),
('clf', KNeighborsClassifier())])
pipe_rf = Pipeline([('scl', MinMaxScaler()),
('pca', PCA(n_components=3)),
('clf', RandomForestClassifier(random_state=42))])
pipe_nb = Pipeline([('scl', MinMaxScaler()),
('pca', PCA(n_components=2)),
('clf', GaussianNB())])
pipelines = [pipe_knn, pipe_rf, pipe_nb]
pipelines[1]
Pipeline(steps=[('scl', MinMaxScaler()), ('pca', PCA(n_components=3)), ('clf', RandomForestClassifier(random_state=42))])
pipe_dict = {0: 'kNN Classifier', 1: 'Random Forest', 2: 'Naive Bayes'}
for pipe in pipelines:
pipe.fit(x_train, y_train)
for idx, val in enumerate(pipelines):
print('%s pipeline test accuracy: %.2f' %
(pipe_dict[idx], val.score(x_test, y_test)))
kNN Classifier pipeline test accuracy: 0.82
Random Forest pipeline test accuracy: 0.85
Naive Bayes pipeline test accuracy: 0.80
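Because each pipeline bundles scaling, PCA and a classifier, the fitted steps can be inspected through named_steps; for instance, how much variance the three PCA components of the random-forest pipeline retain (a minimal sketch):
# Sketch: look inside the fitted random-forest pipeline.
pca_step = pipelines[1].named_steps['pca']
print(pca_step.explained_variance_ratio_)   # share of variance captured by each of the 3 components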
best_acc = 0.0
best_clf = 0
best_pipe = ''
for idx, val in enumerate(pipelines):
if val.score(x_test, y_test) > best_acc:
best_acc = val.score(x_test, y_test)
best_pipe = val
best_clf = idx
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])
Classifier with best accuracy: Random Forest
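With the best pipeline identified, the whole preprocessing-plus-model chain can be persisted and reloaded later; a minimal sketch using joblib (the filename is an arbitrary choice):
import joblib

# Sketch: save the winning pipeline (scaler + PCA + classifier) and reload it for predictions.
joblib.dump(best_pipe, 'best_heart_disease_pipeline.joblib')
reloaded = joblib.load('best_heart_disease_pipeline.joblib')
print(reloaded.predict(x_test[:5]))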