import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline


path = 'C:\\Users\\mahit\\OneDrive\\Desktop\\DSPP\\ML & DL\\Assignments\\JNTUH ML DL assignment 3\\'


df = pd.read_csv(path+'heart_disease_uci.csv')
df


df.head()


df.tail()


df = df.drop(['dataset'],axis = 1)


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   cp        920 non-null    object 
 4   trestbps  861 non-null    float64
 5   chol      890 non-null    float64
 6   fbs       830 non-null    object 
 7   restecg   918 non-null    object 
 8   thalch    865 non-null    float64
 9   exang     865 non-null    object 
 10  oldpeak   858 non-null    float64
 11  slope     611 non-null    object 
 12  ca        309 non-null    float64
 13  thal      434 non-null    object 
 14  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(7)
memory usage: 107.9+ KB


df.isnull().sum()

id            0
age           0
sex           0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64


df.describe()


df['cp'].value_counts()

asymptomatic       496
non-anginal        204
atypical angina    174
typical angina      46
Name: cp, dtype: int64


fbs_1 = {'True':1, 'False':0}
df['fbs'] = df['fbs'].replace(fbs_1)

cp_1 = {'asymptomatic':0, 'non-anginal':1, 'atypical angina': 2, 'typical angina': 3}
df['cp'] = df['cp'].replace(cp_1)

exang_1 = {'True':1, 'False':0}
df['exang'] = df['exang'].replace(exang_1)

slope_1 = {'flat':0, 'upsloping':1, 'downsloping':2}
df['slope'] = df['slope'].replace(slope_1)

ca_1 = {'0.935653':4, '0.000000':0, '1.000000':1, '2.000000':2, '3.000000':3}
df['ca'] = df['ca'].replace(ca_1)

restecg_1 = {'normal':0, 'lv hypertrophy':1, 'st-t abnormality': 2}
df['restecg'] = df['restecg'].replace(restecg_1)

thal_1 = {'normal':0, 'reversable defect':1, 'fixed defect': 2}
df['thal'] = df['thal'].replace(thal_1)

sex_1 = {'Male':1, 'Female':0}
df['sex'] = df['sex'].replace(sex_1)


df['num'].replace([0,1,2,3,4],['absent','present','present','present','present'], inplace = True)


df['num'].value_counts(normalize = True)

present    0.553261
absent     0.446739
Name: num, dtype: float64


df['num'].replace(['present','absent'],[1,0], inplace = True)


df['num'].value_counts(normalize = True)

1    0.553261
0    0.446739
Name: num, dtype: float64


# Numerical
df['trestbps'].fillna(df['trestbps'].mean(),inplace = True)
df['chol'].fillna(df['chol'].mean(),inplace = True)
df['thalch'].fillna(df['thalch'].mean(),inplace = True)
df['oldpeak'].fillna(df['oldpeak'].mean(),inplace = True)
df['ca'].fillna(df['ca'].std(),inplace = True)

# Categorical
df['exang'].fillna(df['exang'].mean(),inplace = True)
df['fbs'].fillna(df['fbs'].mean(),inplace = True)
df['slope'].fillna(df['slope'].mean(),inplace = True)
df['restecg'].fillna(df['restecg'].mean(),inplace = True)
df['thal'].fillna(df['thal'].mean(),inplace = True)


df.isnull().sum()

id          0
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    int64  
 3   cp        920 non-null    int64  
 4   trestbps  920 non-null    float64
 5   chol      920 non-null    float64
 6   fbs       920 non-null    object 
 7   restecg   920 non-null    float64
 8   thalch    920 non-null    float64
 9   exang     920 non-null    object 
 10  oldpeak   920 non-null    float64
 11  slope     920 non-null    float64
 12  ca        920 non-null    float64
 13  thal      920 non-null    float64
 14  num       920 non-null    int64  
dtypes: float64(8), int64(5), object(2)
memory usage: 107.9+ KB


df.describe()


sns.displot(df['sex'])

<seaborn.axisgrid.FacetGrid at 0x2062e4e3a60>


sns.displot(df['cp'])

<seaborn.axisgrid.FacetGrid at 0x20639f05580>


sns.displot(df['age'])

<seaborn.axisgrid.FacetGrid at 0x206354b7220>


sns.displot(df['thal'])

<seaborn.axisgrid.FacetGrid at 0x20639f2f850>


sns.displot(df['restecg'])

<seaborn.axisgrid.FacetGrid at 0x2063af85e20>


sns.displot(df['slope'])

<seaborn.axisgrid.FacetGrid at 0x2063b0b6d90>


df.hist(column = "num", by = "sex", bins = 10)

array([<AxesSubplot:title={'center':'0'}>,
       <AxesSubplot:title={'center':'1'}>], dtype=object)


plt.figure(figsize=(15,15))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)

<AxesSubplot:>


x = df.iloc[:,0:14]
y = df.iloc[:,-1]


x.head()


y.head()

0    0
1    1
2    1
3    0
4    0
Name: num, dtype: int64


minmax=preprocessing.MinMaxScaler(feature_range=(0,1))
minmax.fit(x).transform(x)

array([[0.        , 0.71428571, 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.00108814, 0.79591837, 1.        , ..., 0.        , 1.        ,
        0.        ],
       [0.00217628, 0.79591837, 1.        , ..., 0.        , 0.66666667,
        0.5       ],
       ...,
       [0.99782372, 0.55102041, 1.        , ..., 0.26923077, 0.31188434,
        1.        ],
       [0.99891186, 0.6122449 , 1.        , ..., 0.26923077, 0.31188434,
        0.32718894],
       [1.        , 0.69387755, 1.        , ..., 0.26923077, 0.31188434,
        0.32718894]])


x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.8, random_state = 42, shuffle = True)


x_train.head()


y_train.head()

880    1
457    0
797    1
25     0
84     0
Name: num, dtype: int64


#Fitting Classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(x_train,y_train)

KNeighborsClassifier()


#Predict on test data
y_pred=clf.predict(x_test)


print("Predicted Heart disease : ")
print(y_pred)

Predicted Heart disease : 
[0 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0
 1 1 1 1 0 1 1 0 1 1 0 0 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0 0
 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 1 0 0 1
 0 0 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 1 1 1 1 1 0
 0 1 1 1 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1]


print("Actual Heart disease : ")
print(y_test.values)

Actual Heart disease : 
[0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0
 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0
 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1
 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1
 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]


from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix

print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))

Accuracy score: 80.978261


print(confusion_matrix(y_test, y_pred))

[[63 12]
 [23 86]]


import matplotlib.pyplot as plt  
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_pred, y_test)
roc_auc = auc(fpr, tpr)

#plt.figure()
plt.plot(fpr, tpr, color='darkorange', 
         label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy',  linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()


print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))

ROC score : 81.449541


from sklearn.ensemble import RandomForestClassifier
rmf = RandomForestClassifier(max_depth=3, random_state=42)
rmf_clf = rmf.fit(x_train, y_train)


#Predict on test data
y_pred=rmf.predict(x_test)


print("Predicted Heart disease : ")
print(y_pred)

Predicted Heart disease : 
[0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 0 1 0 1 0
 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 0
 0 0 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 0 0 1 1 1 1 0 0 1 1 1 0 0
 1 1 0 1 0 1 0 1 0 0 1 0 1 1 1 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]


print("Actual Heart disease : ")
print(y_test.values)

Actual Heart disease : 
[0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0
 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0
 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1
 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1
 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]


print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))
print("Recall score : %f" %(recall_score(y_test, y_pred) * 100))
print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))
print(confusion_matrix(y_test, y_pred))

Accuracy score: 88.586957
Recall score : 90.825688
ROC score : 88.079511

[[64 11]
 [10 99]]


plt.figure(dpi=150)
plt.hist(probas, bins=20)
plt.title('Classification Probabilities')
plt.xlabel('Probability')
plt.ylabel('# of Instances')
plt.xlim([0.5, 1.0])
plt.legend(y_test)
plt.show()


from sklearn.naive_bayes import GaussianNB
nb_classifier =GaussianNB()
nb_classifier.fit(x_train, y_train)

GaussianNB()


y_pred=nb_classifier.predict(x_test)


print("Predicted Heart disease : ")
print(y_pred)

Predicted Heart disease : 
[0 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 1 0
 1 0 1 1 0 1 0 0 1 1 1 0 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 0
 0 0 0 1 0 0 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0
 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 0 0 0 0 1 1 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1]


print("Actual Heart disease : ")
print(y_test.values)

Actual Heart disease : 
[0 0 1 1 1 0 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0
 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 0 0 1 1 1 1 0
 1 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 1
 1 1 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1
 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 1 1 0 1 1 0 1]


print("\nAccuracy score: %f" %(accuracy_score(y_test,y_pred) * 100))
print("Recall score : %f" %(recall_score(y_test, y_pred) * 100))
print("ROC score : %f\n" %(roc_auc_score(y_test, y_pred) * 100))
print(confusion_matrix(y_test, y_pred))

Accuracy score: 81.521739
Recall score : 78.899083
ROC score : 82.116208

[[64 11]
 [23 86]]


pipe_knn = Pipeline([('scl', MinMaxScaler()), 
                    ('pca', PCA(n_components=2)), 
                    ('clf', KNeighborsClassifier())])


pipe_rf = Pipeline([('scl', MinMaxScaler()), 
                    ('pca', PCA(n_components=3)), 
                    ('clf', RandomForestClassifier(random_state=42))])


pipe_nb = Pipeline([('scl', MinMaxScaler()), 
                    ('pca', PCA(n_components=2)), 
                    ('clf', GaussianNB())])


pipelines = [pipe_knn, pipe_rf, pipe_nb]

pipelines[1]

Pipeline(steps=[('scl', MinMaxScaler()), ('pca', PCA(n_components=3)),
                ('clf', RandomForestClassifier(random_state=42))])


pipe_dict = {0: 'kNN Classifier', 1: 'Random Forest', 2: 'Navies Bayes'}


for pipe in pipelines:
	pipe.fit(x_train, y_train)


for idx, val in enumerate(pipelines):
	print('%s pipeline test accuracy: %.2f' % 
          (pipe_dict[idx], val.score(x_test, y_test)))

kNN Classifier pipeline test accuracy: 0.82
Random Forest pipeline test accuracy: 0.85
Navies Bayes pipeline test accuracy: 0.80


best_acc = 0.0
best_clf = 0
best_pipe = ''

for idx, val in enumerate(pipelines):
	if val.score(x_test, y_test) > best_acc:
		best_acc = val.score(x_test, y_test)
		best_pipe = val
		best_clf = idx

print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

Classifier with best accuracy: Random Forest

	id	age	trestbps	chol	thalch	oldpeak	ca	num
count	920.000000	920.000000	861.000000	890.000000	865.000000	858.000000	309.000000	920.000000
mean	460.500000	53.510870	132.132404	199.130337	137.545665	0.878788	0.676375	0.995652
std	265.725422	9.424685	19.066070	110.780810	25.926276	1.091226	0.935653	1.142693
min	1.000000	28.000000	0.000000	0.000000	60.000000	-2.600000	0.000000	0.000000
25%	230.750000	47.000000	120.000000	175.000000	120.000000	0.000000	0.000000	0.000000
50%	460.500000	54.000000	130.000000	223.000000	140.000000	0.500000	0.000000	1.000000
75%	690.250000	60.000000	140.000000	268.000000	157.000000	1.500000	1.000000	2.000000
max	920.000000	77.000000	200.000000	603.000000	202.000000	6.200000	3.000000	4.000000

	id	age	sex	cp	trestbps	chol	restecg	thalch	oldpeak	slope	ca	thal	num
count	920.000000	920.000000	920.000000	920.000000	920.000000	920.000000	920.000000	920.000000	920.000000	920.000000	920.000000	920.000000	920.000000
mean	460.500000	53.510870	0.789130	0.750000	132.132404	199.130337	0.594771	137.545665	0.878788	0.538462	0.848570	0.654378	0.553261
std	265.725422	9.424685	0.408148	0.930969	18.443895	108.957634	0.793921	25.138494	1.053774	0.549850	0.555351	0.454881	0.497426
min	1.000000	28.000000	0.000000	0.000000	0.000000	0.000000	0.000000	60.000000	-2.600000	0.000000	0.000000	0.000000	0.000000
25%	230.750000	47.000000	1.000000	0.000000	120.000000	177.750000	0.000000	120.000000	0.000000	0.000000	0.935653	0.654378	0.000000
50%	460.500000	54.000000	1.000000	0.000000	130.000000	221.000000	0.000000	138.000000	0.800000	0.538462	0.935653	0.654378	1.000000
75%	690.250000	60.000000	1.000000	1.000000	140.000000	267.000000	1.000000	156.000000	1.500000	1.000000	0.935653	1.000000	1.000000
max	920.000000	77.000000	1.000000	3.000000	200.000000	603.000000	2.000000	202.000000	6.200000	2.000000	3.000000	2.000000	1.000000

Heart Disease Data¶

Data Cleaning¶

Data Imputation¶

Data Analysis¶

Model Building¶

kNN Classifier¶

Random Forest Classifier¶

Navies Bayes Classification¶

Pipelining¶

	id	age	sex	dataset	cp	trestbps	chol	fbs	restecg	thalch	exang	oldpeak	slope	ca	thal	num
0	1	63	Male	Cleveland	typical angina	145.0	233.0	True	lv hypertrophy	150.0	False	2.3	downsloping	0.0	fixed defect	0
1	2	67	Male	Cleveland	asymptomatic	160.0	286.0	False	lv hypertrophy	108.0	True	1.5	flat	3.0	normal	2
2	3	67	Male	Cleveland	asymptomatic	120.0	229.0	False	lv hypertrophy	129.0	True	2.6	flat	2.0	reversable defect	1
3	4	37	Male	Cleveland	non-anginal	130.0	250.0	False	normal	187.0	False	3.5	downsloping	0.0	normal	0
4	5	41	Female	Cleveland	atypical angina	130.0	204.0	False	lv hypertrophy	172.0	False	1.4	upsloping	0.0	normal	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
915	916	54	Female	VA Long Beach	asymptomatic	127.0	333.0	True	st-t abnormality	154.0	False	0.0	NaN	NaN	NaN	1
916	917	62	Male	VA Long Beach	typical angina	NaN	139.0	False	st-t abnormality	NaN	NaN	NaN	NaN	NaN	NaN	0
917	918	55	Male	VA Long Beach	asymptomatic	122.0	223.0	True	st-t abnormality	100.0	False	0.0	NaN	NaN	fixed defect	2
918	919	58	Male	VA Long Beach	asymptomatic	NaN	385.0	True	lv hypertrophy	NaN	NaN	NaN	NaN	NaN	NaN	0
919	920	62	Male	VA Long Beach	atypical angina	120.0	254.0	False	lv hypertrophy	93.0	True	0.0	NaN	NaN	NaN	1

	id	age	sex	cp	trestbps	chol	fbs	restecg	thalch	exang	oldpeak	slope	ca	thal
880	881	62	1	0	132.132404	170.000000	False	2.0	120.000000	True	3.000000	0.538462	0.935653	0.654378
457	458	54	1	1	150.000000	199.130337	False	0.0	122.000000	False	0.000000	0.538462	0.935653	0.654378
797	798	51	1	1	132.132404	339.000000	False	0.0	137.545665	0.389595	0.878788	0.538462	0.935653	0.654378
25	26	50	0	1	120.000000	219.000000	False	0.0	158.000000	False	1.600000	0.000000	0.000000	0.000000
84	85	52	1	2	120.000000	325.000000	False	0.0	172.000000	False	0.200000	1.000000	0.000000	0.000000