import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
heart_data = pd.read_csv('heart_disease_uci.csv')
heart_data.head()
id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
2 | 3 | 67 | Male | Cleveland | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | Cleveland | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | Cleveland | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
heart_data.drop(['id','dataset'], axis=1, inplace=True)
heart_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 920 entries, 0 to 919 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 920 non-null int64 1 sex 920 non-null object 2 cp 920 non-null object 3 trestbps 861 non-null float64 4 chol 890 non-null float64 5 fbs 830 non-null object 6 restecg 918 non-null object 7 thalch 865 non-null float64 8 exang 865 non-null object 9 oldpeak 858 non-null float64 10 slope 611 non-null object 11 ca 309 non-null float64 12 thal 434 non-null object 13 num 920 non-null int64 dtypes: float64(5), int64(2), object(7) memory usage: 100.8+ KB
heart_data.describe()
age | trestbps | chol | thalch | oldpeak | ca | num | |
---|---|---|---|---|---|---|---|
count | 920.000000 | 861.000000 | 890.000000 | 865.000000 | 858.000000 | 309.000000 | 920.000000 |
mean | 53.510870 | 132.132404 | 199.130337 | 137.545665 | 0.878788 | 0.676375 | 0.995652 |
std | 9.424685 | 19.066070 | 110.780810 | 25.926276 | 1.091226 | 0.935653 | 1.142693 |
min | 28.000000 | 0.000000 | 0.000000 | 60.000000 | -2.600000 | 0.000000 | 0.000000 |
25% | 47.000000 | 120.000000 | 175.000000 | 120.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 54.000000 | 130.000000 | 223.000000 | 140.000000 | 0.500000 | 0.000000 | 1.000000 |
75% | 60.000000 | 140.000000 | 268.000000 | 157.000000 | 1.500000 | 1.000000 | 2.000000 |
max | 77.000000 | 200.000000 | 603.000000 | 202.000000 | 6.200000 | 3.000000 | 4.000000 |
# separateing numeric and categoriecal variables for visualization purpose
CATEGORICAL_COLS = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']
NUMERICAL_COLS = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
heart_cat = heart_data[CATEGORICAL_COLS]
heart_num = heart_data[NUMERICAL_COLS]
heart_cat.nunique()
sex 2 cp 4 fbs 2 restecg 3 exang 2 slope 3 thal 3 ca 4 dtype: int64
fig, axes = plt.subplots(2, 4, figsize=(20,10))
sns.countplot(x='sex', data=heart_cat, ax=axes[0,0])
axes[0,0].set_title('Gender Distribution')
sns.countplot(x='cp', data=heart_cat, ax=axes[0,1])
axes[0,1].tick_params(axis='x', rotation=45)
axes[0,1].set_title('Chest Pain Types')
sns.countplot(x='fbs', data=heart_cat, ax=axes[0,2])
axes[0,2].set_title('Fasting Blood Sugar > 120 mg/dl')
sns.countplot(x='restecg', data=heart_cat, ax=axes[0,3])
axes[0,3].set_title('Resting Electrocardiographic Results')
sns.countplot(x='exang', data=heart_cat, ax=axes[1,0])
axes[1,0].set_title('Exercise Induced Angina')
sns.countplot(x='slope', data=heart_cat, ax=axes[1,1])
axes[1,1].set_title('Slope of the Peak Exercise ST Segment')
sns.countplot(x='thal', data=heart_cat, ax=axes[1,2])
axes[1,2].set_title('Defects')
sns.countplot(x='ca', data=heart_cat, ax=axes[1,3])
axes[1,3].set_title('Number of Major Vessels colored by Fluoroscopy')
plt.tight_layout()
plt.show()
# use scatterplots to visualize key relationships in numerical data
fig, axes = plt.subplots(2, 2, figsize=(10,10))
heart_num.plot('age', 'chol', kind='scatter', ax=axes[0,0])
axes[0,0].set_title('Age Against Cholesterol Levels')
heart_num.plot('age', 'trestbps', kind='scatter', ax=axes[0,1])
axes[0,1].set_title('Age Against Resting Blood Pressure')
heart_num.plot('age', 'thalch', kind='scatter', ax=axes[1,0])
axes[1,0].set_title('Age Against Maximum Heart Rate Achieved')
heart_num.plot('age', 'oldpeak', kind='scatter', ax=axes[1,1])
axes[1,1].set_title('Age Against ST Depression')
plt.tight_layout()
plt.show()
fig, axes = plt.subplots(3, figsize=(7,10))
sns.scatterplot(x='chol', y='thalch', hue='num', data=heart_data, ax=axes[0])
axes[0].set_title('Affect of Cholesterol on Maximum Heart Rate')
sns.scatterplot(x='chol', y='thalch', hue='sex', data=heart_data, ax=axes[1])
sns.scatterplot(x='chol', y='thalch', hue='restecg', data=heart_data, ax=axes[2])
plt.show()
sns.scatterplot(x='trestbps', y='thalch', hue='restecg', data=heart_data)
plt.show()
fig, axes = plt.subplots(3, figsize=(7,10))
axes[0].set_title('Affect of Cholesterol on Resting Blood Pressure')
sns.scatterplot(x='chol', y='trestbps', hue='num', data=heart_data, ax=axes[0])
sns.scatterplot(x='chol', y='trestbps', hue='sex', data=heart_data, ax=axes[1])
sns.scatterplot(x='chol', y='trestbps', hue='restecg', data=heart_data, ax=axes[2])
plt.tight_layout()
plt.show()
heart_data.groupby('num').mean()
age | trestbps | chol | thalch | oldpeak | ca | |
---|---|---|---|---|---|---|
num | ||||||
0 | 50.547445 | 129.913043 | 227.905612 | 148.800512 | 0.418205 | 0.278788 |
1 | 53.528302 | 132.861111 | 195.255814 | 131.035714 | 1.001200 | 0.741379 |
2 | 57.577982 | 133.613861 | 143.859813 | 128.666667 | 1.353465 | 1.222222 |
3 | 59.214953 | 136.152174 | 159.716981 | 120.500000 | 1.581319 | 1.459459 |
4 | 59.214286 | 138.720000 | 192.148148 | 127.846154 | 2.307692 | 1.692308 |
print('Average Cholesterol Level Based on Target Variable and Chest Pain Type')
print(pd.crosstab(index=heart_data.num, columns=heart_data.cp, values=heart_data.chol, aggfunc=np.mean))
print('\n')
print('Average Cholesterol Level Based on Target Variable and Patient Gender')
print(pd.crosstab(index=heart_data.num, columns=heart_data.sex, values=heart_data.chol, aggfunc=np.mean))
print('\n')
print('Average Cholesterol Level Based on Target Variable and Cardiographic Results')
print(pd.crosstab(index=heart_data.num, columns=heart_data.restecg, values=heart_data.chol, aggfunc=np.mean))
Average Cholesterol Level Based on Target Variable and Chest Pain Type cp asymptomatic atypical angina non-anginal typical angina num 0 227.843137 233.957143 222.209677 222.730769 1 193.273684 250.157895 170.756757 215.250000 2 152.321839 123.000000 118.642857 58.500000 3 157.219512 200.000000 152.888889 228.666667 4 196.478261 NaN 146.000000 231.000000 Average Cholesterol Level Based on Target Variable and Patient Gender sex Female Male num 0 248.102190 217.054902 1 221.366667 191.820175 2 216.400000 136.381443 3 216.250000 155.102041 4 316.000000 182.240000 Average Cholesterol Level Based on Target Variable and Cardiographic Results restecg lv hypertrophy normal st-t abnormality num 0 251.768293 227.797619 194.637931 1 216.318182 194.243902 181.395833 2 231.666667 116.629630 132.187500 3 241.230769 130.408163 137.677419 4 238.166667 137.875000 175.285714
# Display correlation matrix and heatmap
corr = heart_data.corr()
print(corr)
sns.heatmap(corr)
plt.show()
age trestbps chol thalch oldpeak ca num age 1.000000 0.244253 -0.086234 -0.365778 0.258243 0.370416 0.339596 trestbps 0.244253 1.000000 0.092853 -0.104899 0.161908 0.093705 0.122291 chol -0.086234 0.092853 1.000000 0.236121 0.047734 0.051606 -0.231547 thalch -0.365778 -0.104899 0.236121 1.000000 -0.151174 -0.264094 -0.366265 oldpeak 0.258243 0.161908 0.047734 -0.151174 1.000000 0.281817 0.443084 ca 0.370416 0.093705 0.051606 -0.264094 0.281817 1.000000 0.516216 num 0.339596 0.122291 -0.231547 -0.366265 0.443084 0.516216 1.000000
# Display boxplot to visualize outliers in the data
heart_data.boxplot()
plt.show()
heart_data.loc[heart_data['chol']==0,:]
age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
597 | 32 | Male | typical angina | 95.0 | 0.0 | NaN | normal | 127.0 | False | 0.7 | upsloping | NaN | NaN | 1 |
598 | 34 | Male | asymptomatic | 115.0 | 0.0 | NaN | NaN | 154.0 | False | 0.2 | upsloping | NaN | NaN | 1 |
599 | 35 | Male | asymptomatic | NaN | 0.0 | NaN | normal | 130.0 | True | NaN | NaN | NaN | reversable defect | 3 |
600 | 36 | Male | asymptomatic | 110.0 | 0.0 | NaN | normal | 125.0 | True | 1.0 | flat | NaN | fixed defect | 1 |
601 | 38 | Female | asymptomatic | 105.0 | 0.0 | NaN | normal | 166.0 | False | 2.8 | upsloping | NaN | NaN | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
818 | 43 | Male | asymptomatic | 122.0 | 0.0 | False | normal | 120.0 | False | 0.5 | upsloping | NaN | NaN | 1 |
819 | 63 | Male | non-anginal | 130.0 | 0.0 | True | st-t abnormality | 160.0 | False | 3.0 | flat | NaN | NaN | 0 |
822 | 48 | Male | non-anginal | 102.0 | 0.0 | NaN | st-t abnormality | 110.0 | True | 1.0 | downsloping | NaN | NaN | 1 |
839 | 56 | Male | asymptomatic | NaN | 0.0 | False | lv hypertrophy | NaN | NaN | NaN | NaN | NaN | NaN | 1 |
840 | 62 | Male | non-anginal | NaN | 0.0 | True | st-t abnormality | NaN | NaN | NaN | NaN | NaN | NaN | 2 |
172 rows × 14 columns
heart_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 920 entries, 0 to 919 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 920 non-null int64 1 sex 920 non-null object 2 cp 920 non-null object 3 trestbps 861 non-null float64 4 chol 890 non-null float64 5 fbs 830 non-null object 6 restecg 918 non-null object 7 thalch 865 non-null float64 8 exang 865 non-null object 9 oldpeak 858 non-null float64 10 slope 611 non-null object 11 ca 309 non-null float64 12 thal 434 non-null object 13 num 920 non-null int64 dtypes: float64(5), int64(2), object(7) memory usage: 100.8+ KB
# Cholesterol Levels
median_chol = heart_data.loc[heart_data['chol']!=0, 'chol'].median()
heart_df = heart_data.fillna(value={'chol': median_chol})
heart_df.loc[heart_df['chol']==0, 'chol'] = median_chol
# Resting Blood Pressure
mean_bp = heart_df.loc[heart_df['trestbps']!=0,'trestbps'].mean()
heart_df = heart_df.fillna(value={'trestbps': mean_bp})
heart_df.loc[heart_df['trestbps']==0, 'trestbps'] = mean_bp
# Maximum Heart Rate
mean_hr = heart_df.loc[heart_df['thalch']!=0,'thalch'].mean()
heart_df = heart_df.fillna(value={'thalch': mean_hr})
heart_df.loc[heart_df['thalch']==0, 'thalch'] = mean_hr
# Old Peak
mean_peak = heart_df.oldpeak.mean()
heart_df = heart_df.fillna(value={'oldpeak': mean_peak})
heart_df.loc[heart_df['oldpeak']==0, 'oldpeak'] = mean_peak
# Drop columns with a great number of missing values and reassign datatypes
heart_df.drop(labels=['ca','thal','slope'], axis=1, inplace=True)
heart_df = heart_df.astype({'sex':'category', 'cp':'category', 'fbs':'bool', 'restecg':'category', 'exang':'bool'})
# Drop remaining rows with missing values and display distribution for target variables
heart_df.dropna(inplace=True)
sns.countplot('num', data=heart_df)
plt.show()
C:\Users\Koushik Reddy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# One hot encode the categorical variables and split the target and independent variables
heart_onehot = pd.get_dummies(heart_df, columns=['sex','cp', 'fbs', 'restecg', 'exang'])
X = heart_onehot.drop('num', axis=1)
y = heart_onehot.num
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train.value_counts()
0 340 1 204 3 86 2 82 4 22 Name: num, dtype: int64
heart_onehot.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 918 entries, 0 to 919 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 918 non-null int64 1 trestbps 918 non-null float64 2 chol 918 non-null float64 3 thalch 918 non-null float64 4 oldpeak 918 non-null float64 5 num 918 non-null int64 6 sex_Female 918 non-null uint8 7 sex_Male 918 non-null uint8 8 cp_asymptomatic 918 non-null uint8 9 cp_atypical angina 918 non-null uint8 10 cp_non-anginal 918 non-null uint8 11 cp_typical angina 918 non-null uint8 12 fbs_False 918 non-null uint8 13 fbs_True 918 non-null uint8 14 restecg_lv hypertrophy 918 non-null uint8 15 restecg_normal 918 non-null uint8 16 restecg_st-t abnormality 918 non-null uint8 17 exang_False 918 non-null uint8 18 exang_True 918 non-null uint8 dtypes: float64(4), int64(2), uint8(13) memory usage: 61.9 KB
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
weights = {0:1, 1:0.5, 2:0.5, 3:0.5, 4:0.5}
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.62 0.92 0.74 71 1 0.40 0.32 0.36 59 2 0.40 0.15 0.22 27 3 0.31 0.19 0.24 21 4 0.22 0.33 0.27 6 accuracy 0.51 184 macro avg 0.39 0.38 0.36 184 weighted avg 0.47 0.51 0.47 184
# Perform Decision Tree model with class weighting
weights = {0:1, 1:0.5, 2:0.5, 3:0.5, 4:0.5}
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, class_weight='balanced')
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.75 0.68 0.71 71 1 0.45 0.37 0.41 59 2 0.29 0.37 0.33 27 3 0.30 0.33 0.32 21 4 0.14 0.33 0.20 6 accuracy 0.48 184 macro avg 0.39 0.42 0.39 184 weighted avg 0.52 0.48 0.50 184
gradient_booster = GradientBoostingClassifier(learning_rate=0.02, max_depth=3, n_estimators=150)
gradient_booster.fit(X_train, y_train)
y_pred = gradient_booster.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.62 0.89 0.73 71 1 0.44 0.46 0.45 59 2 0.33 0.07 0.12 27 3 0.45 0.24 0.31 21 4 0.50 0.33 0.40 6 accuracy 0.54 184 macro avg 0.47 0.40 0.40 184 weighted avg 0.50 0.54 0.49 184
clf = RandomForestClassifier(n_estimators=150)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.61 0.87 0.72 71 1 0.48 0.42 0.45 59 2 0.23 0.11 0.15 27 3 0.31 0.24 0.27 21 4 0.00 0.00 0.00 6 accuracy 0.52 184 macro avg 0.33 0.33 0.32 184 weighted avg 0.46 0.52 0.47 184
clf = RandomForestClassifier(n_estimators=150, class_weight='balanced_subsample')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.59 0.86 0.70 71 1 0.47 0.37 0.42 59 2 0.29 0.15 0.20 27 3 0.30 0.29 0.29 21 4 0.00 0.00 0.00 6 accuracy 0.51 184 macro avg 0.33 0.33 0.32 184 weighted avg 0.45 0.51 0.47 184
C:\Users\Koushik Reddy\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Koushik Reddy\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Koushik Reddy\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
from imblearn.over_sampling import SMOTE
smt = SMOTE(sampling_strategy='not majority')
print('Before', y_train.value_counts())
X_train_SM, y_train_SM = smt.fit_resample(X_train, y_train)
val, counter = np.unique(y_train_SM, return_counts=True)
print('After', (val, counter))
Before 0 340 1 204 3 86 2 82 4 22 Name: num, dtype: int64 After (array([0, 1, 2, 3, 4], dtype=int64), array([340, 340, 340, 340, 340], dtype=int64))
clf = DecisionTreeClassifier(criterion='entropy', max_depth=6)
clf.fit(X_train_SM, y_train_SM)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
precision recall f1-score support 0 0.72 0.68 0.70 71 1 0.42 0.31 0.35 59 2 0.14 0.11 0.12 27 3 0.20 0.33 0.25 21 4 0.18 0.50 0.26 6 accuracy 0.43 184 macro avg 0.33 0.39 0.34 184 weighted avg 0.46 0.43 0.44 184