In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
In [2]:
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, explained_variance_score
In [3]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")
In [4]:
df=pd.read_csv(r"C:\Users\Ismail\Desktop\heart_disease_uci.csv")
In [5]:
print(df.columns)
Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')
In [6]:
df
Out[6]:
id age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
0 1 63 Male Cleveland typical angina 145.0 233.0 True lv hypertrophy 150.0 False 2.3 downsloping 0.0 fixed defect 0
1 2 67 Male Cleveland asymptomatic 160.0 286.0 False lv hypertrophy 108.0 True 1.5 flat 3.0 normal 2
2 3 67 Male Cleveland asymptomatic 120.0 229.0 False lv hypertrophy 129.0 True 2.6 flat 2.0 reversable defect 1
3 4 37 Male Cleveland non-anginal 130.0 250.0 False normal 187.0 False 3.5 downsloping 0.0 normal 0
4 5 41 Female Cleveland atypical angina 130.0 204.0 False lv hypertrophy 172.0 False 1.4 upsloping 0.0 normal 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
915 916 54 Female VA Long Beach asymptomatic 127.0 333.0 True st-t abnormality 154.0 False 0.0 NaN NaN NaN 1
916 917 62 Male VA Long Beach typical angina NaN 139.0 False st-t abnormality NaN NaN NaN NaN NaN NaN 0
917 918 55 Male VA Long Beach asymptomatic 122.0 223.0 True st-t abnormality 100.0 False 0.0 NaN NaN fixed defect 2
918 919 58 Male VA Long Beach asymptomatic NaN 385.0 True lv hypertrophy NaN NaN NaN NaN NaN NaN 0
919 920 62 Male VA Long Beach atypical angina 120.0 254.0 False lv hypertrophy 93.0 True 0.0 NaN NaN NaN 1

920 rows × 16 columns

In [8]:
df.head(5)
Out[8]:
id age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
0 1 63 Male Cleveland typical angina 145.0 233.0 True lv hypertrophy 150.0 False 2.3 downsloping 0.0 fixed defect 0
1 2 67 Male Cleveland asymptomatic 160.0 286.0 False lv hypertrophy 108.0 True 1.5 flat 3.0 normal 2
2 3 67 Male Cleveland asymptomatic 120.0 229.0 False lv hypertrophy 129.0 True 2.6 flat 2.0 reversable defect 1
3 4 37 Male Cleveland non-anginal 130.0 250.0 False normal 187.0 False 3.5 downsloping 0.0 normal 0
4 5 41 Female Cleveland atypical angina 130.0 204.0 False lv hypertrophy 172.0 False 1.4 upsloping 0.0 normal 0
In [9]:
df.describe
Out[9]:
<bound method NDFrame.describe of       id  age     sex        dataset               cp  trestbps   chol    fbs  \
0      1   63    Male      Cleveland   typical angina     145.0  233.0   True   
1      2   67    Male      Cleveland     asymptomatic     160.0  286.0  False   
2      3   67    Male      Cleveland     asymptomatic     120.0  229.0  False   
3      4   37    Male      Cleveland      non-anginal     130.0  250.0  False   
4      5   41  Female      Cleveland  atypical angina     130.0  204.0  False   
..   ...  ...     ...            ...              ...       ...    ...    ...   
915  916   54  Female  VA Long Beach     asymptomatic     127.0  333.0   True   
916  917   62    Male  VA Long Beach   typical angina       NaN  139.0  False   
917  918   55    Male  VA Long Beach     asymptomatic     122.0  223.0   True   
918  919   58    Male  VA Long Beach     asymptomatic       NaN  385.0   True   
919  920   62    Male  VA Long Beach  atypical angina     120.0  254.0  False   

              restecg  thalch  exang  oldpeak        slope   ca  \
0      lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1      lv hypertrophy   108.0   True      1.5         flat  3.0   
2      lv hypertrophy   129.0   True      2.6         flat  2.0   
3              normal   187.0  False      3.5  downsloping  0.0   
4      lv hypertrophy   172.0  False      1.4    upsloping  0.0   
..                ...     ...    ...      ...          ...  ...   
915  st-t abnormality   154.0  False      0.0          NaN  NaN   
916  st-t abnormality     NaN    NaN      NaN          NaN  NaN   
917  st-t abnormality   100.0  False      0.0          NaN  NaN   
918    lv hypertrophy     NaN    NaN      NaN          NaN  NaN   
919    lv hypertrophy    93.0   True      0.0          NaN  NaN   

                  thal  num  
0         fixed defect    0  
1               normal    2  
2    reversable defect    1  
3               normal    0  
4               normal    0  
..                 ...  ...  
915                NaN    1  
916                NaN    0  
917       fixed defect    2  
918                NaN    0  
919                NaN    1  

[920 rows x 16 columns]>
In [15]:
df_n=df[(df.dataset=='Cleveland')]
df_n
Out[15]:
id age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
0 1 63 Male Cleveland typical angina 145.0 233.0 True lv hypertrophy 150.0 False 2.3 downsloping 0.0 fixed defect 0
1 2 67 Male Cleveland asymptomatic 160.0 286.0 False lv hypertrophy 108.0 True 1.5 flat 3.0 normal 2
2 3 67 Male Cleveland asymptomatic 120.0 229.0 False lv hypertrophy 129.0 True 2.6 flat 2.0 reversable defect 1
3 4 37 Male Cleveland non-anginal 130.0 250.0 False normal 187.0 False 3.5 downsloping 0.0 normal 0
4 5 41 Female Cleveland atypical angina 130.0 204.0 False lv hypertrophy 172.0 False 1.4 upsloping 0.0 normal 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
299 300 68 Male Cleveland asymptomatic 144.0 193.0 True normal 141.0 False 3.4 flat 2.0 reversable defect 2
300 301 57 Male Cleveland asymptomatic 130.0 131.0 False normal 115.0 True 1.2 flat 1.0 reversable defect 3
301 302 57 Female Cleveland atypical angina 130.0 236.0 False lv hypertrophy 174.0 False 0.0 flat 1.0 normal 1
302 303 38 Male Cleveland non-anginal 138.0 175.0 False normal 173.0 False 0.0 upsloping NaN normal 0
303 304 28 Male Cleveland atypical angina 130.0 132.0 False lv hypertrophy 185.0 False 0.0 NaN NaN NaN 0

304 rows × 16 columns

In [17]:
df_Male= df_n[(df_n.sex=='Male')]
df_Male
Out[17]:
id age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
0 1 63 Male Cleveland typical angina 145.0 233.0 True lv hypertrophy 150.0 False 2.3 downsloping 0.0 fixed defect 0
1 2 67 Male Cleveland asymptomatic 160.0 286.0 False lv hypertrophy 108.0 True 1.5 flat 3.0 normal 2
2 3 67 Male Cleveland asymptomatic 120.0 229.0 False lv hypertrophy 129.0 True 2.6 flat 2.0 reversable defect 1
3 4 37 Male Cleveland non-anginal 130.0 250.0 False normal 187.0 False 3.5 downsloping 0.0 normal 0
5 6 56 Male Cleveland atypical angina 120.0 236.0 False normal 178.0 False 0.8 upsloping 0.0 normal 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 299 45 Male Cleveland typical angina 110.0 264.0 False normal 132.0 False 1.2 flat 0.0 reversable defect 1
299 300 68 Male Cleveland asymptomatic 144.0 193.0 True normal 141.0 False 3.4 flat 2.0 reversable defect 2
300 301 57 Male Cleveland asymptomatic 130.0 131.0 False normal 115.0 True 1.2 flat 1.0 reversable defect 3
302 303 38 Male Cleveland non-anginal 138.0 175.0 False normal 173.0 False 0.0 upsloping NaN normal 0
303 304 28 Male Cleveland atypical angina 130.0 132.0 False lv hypertrophy 185.0 False 0.0 NaN NaN NaN 0

207 rows × 16 columns

In [18]:
df_Female= df_n[(df_n.sex=='Female')]
df_Female
Out[18]:
id age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
4 5 41 Female Cleveland atypical angina 130.0 204.0 False lv hypertrophy 172.0 False 1.4 upsloping 0.0 normal 0
6 7 62 Female Cleveland asymptomatic 140.0 268.0 False lv hypertrophy 160.0 False 3.6 downsloping 2.0 normal 3
7 8 57 Female Cleveland asymptomatic 120.0 354.0 False normal 163.0 True 0.6 upsloping 0.0 normal 0
11 12 56 Female Cleveland atypical angina 140.0 294.0 False lv hypertrophy 153.0 False 1.3 flat 0.0 normal 0
18 19 48 Female Cleveland non-anginal 130.0 275.0 False normal 139.0 False 0.2 upsloping 0.0 normal 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
286 287 58 Female Cleveland asymptomatic 170.0 225.0 True lv hypertrophy 146.0 True 2.8 flat 2.0 fixed defect 2
291 292 55 Female Cleveland atypical angina 132.0 342.0 False normal 166.0 False 1.2 upsloping 0.0 normal 0
294 295 63 Female Cleveland asymptomatic 124.0 197.0 False normal 136.0 True 0.0 flat 0.0 normal 1
297 298 57 Female Cleveland asymptomatic 140.0 241.0 False normal 123.0 True 0.2 flat 0.0 reversable defect 1
301 302 57 Female Cleveland atypical angina 130.0 236.0 False lv hypertrophy 174.0 False 0.0 flat 1.0 normal 1

97 rows × 16 columns

In [21]:
data=df_n
data = data[(data['chol'] <= 420) & (data['oldpeak'] >=0) & (data['oldpeak'] <=4)].reset_index(drop=True)
data = data.dropna().reset_index(drop=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        291 non-null    int64  
 1   age       291 non-null    int64  
 2   sex       291 non-null    object 
 3   dataset   291 non-null    object 
 4   cp        291 non-null    object 
 5   trestbps  291 non-null    float64
 6   chol      291 non-null    float64
 7   fbs       291 non-null    object 
 8   restecg   291 non-null    object 
 9   thalch    291 non-null    float64
 10  exang     291 non-null    object 
 11  oldpeak   291 non-null    float64
 12  slope     291 non-null    object 
 13  ca        291 non-null    float64
 14  thal      291 non-null    object 
 15  num       291 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 36.5+ KB
In [22]:
data.describe()
Out[22]:
id age trestbps chol thalch oldpeak ca num
count 291.000000 291.000000 291.000000 291.000000 291.000000 291.000000 291.000000 291.000000
mean 150.237113 54.457045 131.498282 246.158076 149.855670 0.987285 0.659794 0.920962
std 87.725250 9.094184 17.587883 48.554652 22.989948 1.055009 0.915987 1.213689
min 1.000000 29.000000 94.000000 126.000000 71.000000 0.000000 0.000000 0.000000
25% 73.500000 47.500000 120.000000 211.000000 133.500000 0.000000 0.000000 0.000000
50% 149.000000 56.000000 130.000000 242.000000 153.000000 0.800000 0.000000 0.000000
75% 226.500000 61.000000 140.000000 275.000000 166.000000 1.600000 1.000000 2.000000
max 302.000000 77.000000 200.000000 417.000000 202.000000 4.000000 3.000000 4.000000
In [25]:
CATEGORICAL_COLS = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']
NUMERICAL_COLS = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']

heart_cat =data[CATEGORICAL_COLS]
heart_num = data[NUMERICAL_COLS]

heart_cat.nunique()
Out[25]:
sex        2
cp         4
fbs        2
restecg    3
exang      2
slope      3
thal       3
ca         4
dtype: int64
In [26]:
fig, axes = plt.subplots(2, 4, figsize=(20,10))

sns.countplot(x='sex', data=heart_cat, ax=axes[0,0])
axes[0,0].set_title('Gender Distribution')

sns.countplot(x='cp', data=heart_cat, ax=axes[0,1])
axes[0,1].tick_params(axis='x', rotation=45)
axes[0,1].set_title('Chest Pain Types')

sns.countplot(x='fbs', data=heart_cat, ax=axes[0,2])
axes[0,2].set_title('Fasting Blood Sugar > 120 mg/dl')

sns.countplot(x='restecg', data=heart_cat, ax=axes[0,3])
axes[0,3].set_title('Resting Electrocardiographic Results')

sns.countplot(x='exang', data=heart_cat, ax=axes[1,0])
axes[1,0].set_title('Exercise Induced Angina')

sns.countplot(x='slope', data=heart_cat, ax=axes[1,1])
axes[1,1].set_title('Slope of the Peak Exercise ST Segment')

sns.countplot(x='thal', data=heart_cat, ax=axes[1,2])
axes[1,2].set_title('Defects')

sns.countplot(x='ca', data=heart_cat, ax=axes[1,3])
axes[1,3].set_title('Number of Major Vessels colored by Fluoroscopy')
plt.tight_layout()
plt.show()
In [27]:
fig, axes = plt.subplots(2, 2, figsize=(10,10))

heart_num.plot('age', 'chol', kind='scatter', ax=axes[0,0])
axes[0,0].set_title('Age Against Cholesterol Levels')

heart_num.plot('age', 'trestbps', kind='scatter', ax=axes[0,1])
axes[0,1].set_title('Age Against Resting Blood Pressure')

heart_num.plot('age', 'thalch', kind='scatter', ax=axes[1,0])
axes[1,0].set_title('Age Against Maximum Heart Rate Achieved')

heart_num.plot('age', 'oldpeak', kind='scatter', ax=axes[1,1])
axes[1,1].set_title('Age Against ST Depression')

plt.tight_layout()
plt.show()
In [29]:
fig, axes = plt.subplots(3, figsize=(7,10))

sns.scatterplot(x='chol', y='thalch', hue='num', data=data, ax=axes[0])
axes[0].set_title('Affect of Cholesterol on Maximum Heart Rate')

sns.scatterplot(x='chol', y='thalch', hue='sex', data=data, ax=axes[1])

sns.scatterplot(x='chol', y='thalch', hue='restecg', data=data, ax=axes[2])
plt.show()
In [32]:
sns.scatterplot(x='trestbps', y='thalch', hue='restecg', data=data)
plt.show()
In [35]:
fig, axes = plt.subplots(3, figsize=(7,10))

axes[0].set_title('Affect of Cholesterol on Resting Blood Pressure')
sns.scatterplot(x='chol', y='trestbps', hue='num', data=data, ax=axes[0])
sns.scatterplot(x='chol', y='trestbps', hue='sex', data=data, ax=axes[1])
sns.scatterplot(x='chol', y='trestbps', hue='restecg', data=data, ax=axes[2])

plt.tight_layout()
plt.show()
In [36]:
data.groupby('num').mean()
Out[36]:
id age trestbps chol thalch oldpeak ca
num
0 146.208861 52.512658 128.955696 241.297468 158.658228 0.569620 0.278481
1 152.185185 55.611111 133.277778 249.148148 145.981481 1.022222 0.740741
2 169.371429 58.200000 134.371429 260.857143 135.000000 1.802857 1.257143
3 154.312500 56.000000 134.406250 248.343750 132.625000 1.646875 1.406250
4 127.833333 59.833333 140.833333 248.000000 140.666667 2.191667 1.583333
In [38]:
print('Average Cholesterol Level Based on Target Variable and Chest Pain Type')
print(pd.crosstab(index=data.num, columns=data.cp, values=data.chol, aggfunc=np.mean))
print('\n')

print('Average Cholesterol Level Based on Target Variable and Patient Gender')
print(pd.crosstab(index=data.num, columns=data.sex, values=data.chol, aggfunc=np.mean))
print('\n')

print('Average Cholesterol Level Based on Target Variable and Cardiographic Results')
print(pd.crosstab(index=data.num, columns=data.restecg, values=data.chol, aggfunc=np.mean))
Average Cholesterol Level Based on Target Variable and Chest Pain Type
cp   asymptomatic  atypical angina  non-anginal  typical angina
num                                                            
0      245.333333       241.200000   242.203125           227.2
1      249.764706       254.166667   236.222222           262.2
2      264.103448       246.000000   247.750000           234.0
3      245.846154       300.000000   238.750000             NaN
4      251.500000              NaN   230.000000           231.0


Average Cholesterol Level Based on Target Variable and Patient Gender
sex      Female        Male
num                        
0    252.942857  232.034091
1    264.222222  246.133333
2    309.142857  248.785714
3    261.000000  245.423077
4    316.000000  234.400000


Average Cholesterol Level Based on Target Variable and Cardiographic Results
restecg  lv hypertrophy      normal  st-t abnormality
num                                                  
0            250.661538  235.163043             197.0
1            249.290323  248.956522               NaN
2            274.437500  245.111111             327.0
3            255.571429  237.500000             205.0
4            257.100000  202.500000               NaN
In [39]:
corr = data.corr()
print(corr)

sns.heatmap(corr)
plt.show()
                id       age  trestbps      chol    thalch   oldpeak  \
id        1.000000  0.009105 -0.017465 -0.106741 -0.116757 -0.138977   
age       0.009105  1.000000  0.295220  0.193051 -0.401816  0.198531   
trestbps -0.017465  0.295220  1.000000  0.175936 -0.042743  0.163879   
chol     -0.106741  0.193051  0.175936  1.000000 -0.008048  0.040159   
thalch   -0.116757 -0.401816 -0.042743 -0.008048  1.000000 -0.344198   
oldpeak  -0.138977  0.198531  0.163879  0.040159 -0.344198  1.000000   
ca        0.039114  0.374728  0.099386  0.138833 -0.268102  0.262771   
num       0.024629  0.228222  0.165492  0.087107 -0.413299  0.485032   

                ca       num  
id        0.039114  0.024629  
age       0.374728  0.228222  
trestbps  0.099386  0.165492  
chol      0.138833  0.087107  
thalch   -0.268102 -0.413299  
oldpeak   0.262771  0.485032  
ca        1.000000  0.503025  
num       0.503025  1.000000  
In [40]:
data.boxplot()
plt.show()
In [ ]: