import numpy as np
import pandas as pd
import math
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Load the heart-disease records from a CSV file in the working directory.
data = pd.read_csv("heart_disease_uci.csv")
id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
2 | 3 | 67 | Male | Cleveland | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | Cleveland | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | Cleveland | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
# Copy the diagnosis column `num` into a new `condition` column, then drop
# `num` together with the `id`, `dataset`, `ca` and `thal` columns.
data = data.assign(condition=data['num']).drop(
    columns=['id', 'dataset', 'ca', 'thal', 'num']
)
age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | condition | |
0 | 63 | Male | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0 |
1 | 67 | Male | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 2 |
2 | 67 | Male | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 1 |
3 | 37 | Male | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0 |
4 | 41 | Female | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0 |
# Keep rows with chol <= 420 and 0 <= oldpeak <= 4, discard any row that still
# has a missing value, and renumber the index from 0.
data = (
    data.loc[data['chol'].le(420) & data['oldpeak'].between(0, 4)]
    .dropna()
    .reset_index(drop=True)
)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 520 entries, 0 to 519 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 520 non-null int64 1 sex 520 non-null object 2 cp 520 non-null object 3 trestbps 520 non-null float64 4 chol 520 non-null float64 5 fbs 520 non-null object 6 restecg 520 non-null object 7 thalch 520 non-null float64 8 exang 520 non-null object 9 oldpeak 520 non-null float64 10 slope 520 non-null object 11 condition 520 non-null int64 dtypes: float64(4), int64(2), object(6) memory usage: 48.9+ KB
def str_features_to_numeric(data):
    """Replace every non-numeric column of ``data`` with integer label codes.

    A column is treated as categorical when its dtype is not one of the plain
    integer/float dtypes listed in ``numerics`` (so object, string and bool
    columns are all encoded).  Each categorical column is converted to text
    and its distinct values are numbered 0..k-1 in sorted order, e.g.
    ``'Female' -> 0``, ``'Male' -> 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with categorical columns replaced by
        integer codes and numeric columns left untouched.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']

    # Collect the columns that need encoding: anything not already numeric.
    categorical_columns = [
        col for col in data.columns if data[col].dtype not in numerics
    ]

    # Encoding categorical features
    for col in categorical_columns:
        # Compare values by their text form so mixed-type columns sort cleanly.
        as_text = data[col].astype(str)
        # sort=True numbers the labels in sorted order, which is the same
        # assignment a fitted label encoder would produce.
        codes, _ = pd.factorize(as_text, sort=True)
        data[col] = codes
    return data
# Run the string-to-numeric encoding helper over the whole frame and keep
# the frame it returns.
data = str_features_to_numeric(data)
age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | condition | |
0 | 63 | 1 | 3 | 145.0 | 233.0 | 1 | 0 | 150.0 | 0 | 2.3 | 0 | 0 |
1 | 67 | 1 | 0 | 160.0 | 286.0 | 0 | 0 | 108.0 | 1 | 1.5 | 1 | 2 |
2 | 67 | 1 | 0 | 120.0 | 229.0 | 0 | 0 | 129.0 | 1 | 2.6 | 1 | 1 |
3 | 37 | 1 | 2 | 130.0 | 250.0 | 0 | 1 | 187.0 | 0 | 3.5 | 0 | 0 |
4 | 41 | 0 | 1 | 130.0 | 204.0 | 0 | 0 | 172.0 | 0 | 1.4 | 2 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
515 | 57 | 1 | 0 | 130.0 | 207.0 | 0 | 2 | 96.0 | 1 | 1.0 | 1 | 0 |
516 | 74 | 1 | 0 | 155.0 | 310.0 | 0 | 1 | 112.0 | 1 | 1.5 | 0 | 2 |
517 | 51 | 0 | 0 | 114.0 | 258.0 | 1 | 0 | 96.0 | 0 | 1.0 | 2 | 0 |
518 | 62 | 1 | 0 | 160.0 | 254.0 | 1 | 2 | 108.0 | 1 | 3.0 | 1 | 4 |
519 | 53 | 1 | 0 | 144.0 | 300.0 | 1 | 2 | 128.0 | 1 | 1.5 | 1 | 3 |
520 rows × 12 columns
0 203 1 159 2 70 3 68 4 20 Name: condition, dtype: int64
# Restrict the frame to rows whose `condition` is 0 or 1; rows with any other
# value are discarded (the original row labels are kept, so the index has gaps).
data = data.loc[data['condition'].isin({0, 1})]
age | sex | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | condition | |
0 | 63 | 1 | 3 | 145.0 | 233.0 | 1 | 0 | 150.0 | 0 | 2.3 | 0 | 0 |
2 | 67 | 1 | 0 | 120.0 | 229.0 | 0 | 0 | 129.0 | 1 | 2.6 | 1 | 1 |
3 | 37 | 1 | 2 | 130.0 | 250.0 | 0 | 1 | 187.0 | 0 | 3.5 | 0 | 0 |
4 | 41 | 0 | 1 | 130.0 | 204.0 | 0 | 0 | 172.0 | 0 | 1.4 | 2 | 0 |
5 | 56 | 1 | 1 | 120.0 | 236.0 | 0 | 1 | 178.0 | 0 | 0.8 | 2 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
512 | 60 | 1 | 0 | 130.0 | 186.0 | 1 | 0 | 140.0 | 1 | 0.5 | 1 | 1 |
513 | 55 | 1 | 0 | 120.0 | 226.0 | 0 | 0 | 127.0 | 1 | 1.7 | 0 | 1 |
514 | 56 | 1 | 0 | 130.0 | 203.0 | 1 | 1 | 98.0 | 0 | 1.5 | 1 | 1 |
515 | 57 | 1 | 0 | 130.0 | 207.0 | 0 | 2 | 96.0 | 1 | 1.0 | 1 | 0 |
517 | 51 | 0 | 0 | 114.0 | 258.0 | 1 | 0 | 96.0 | 0 | 1.0 | 2 | 0 |
362 rows × 12 columns
age 0 sex 0 cp 0 trestbps 0 chol 0 fbs 0 restecg 0 thalch 0 exang 0 oldpeak 0 slope 0 condition 0 dtype: int64
(362, 12)
0 203 1 159 Name: condition, dtype: int64
# Target vector: the `condition` column.
Y = data['condition']
# Feature matrix: every remaining column.
X = data.drop(columns=['condition'])
age sex cp trestbps chol fbs restecg thalch exang oldpeak \ 0 63 1 3 145.0 233.0 1 0 150.0 0 2.3 2 67 1 0 120.0 229.0 0 0 129.0 1 2.6 3 37 1 2 130.0 250.0 0 1 187.0 0 3.5 4 41 0 1 130.0 204.0 0 0 172.0 0 1.4 5 56 1 1 120.0 236.0 0 1 178.0 0 0.8 .. ... ... .. ... ... ... ... ... ... ... 512 60 1 0 130.0 186.0 1 0 140.0 1 0.5 513 55 1 0 120.0 226.0 0 0 127.0 1 1.7 514 56 1 0 130.0 203.0 1 1 98.0 0 1.5 515 57 1 0 130.0 207.0 0 2 96.0 1 1.0 517 51 0 0 114.0 258.0 1 0 96.0 0 1.0 slope 0 0 2 1 3 0 4 2 5 2 .. ... 512 1 513 0 514 1 515 1 517 2 [362 rows x 11 columns]
0 0 2 1 3 0 4 0 5 0 .. 512 1 513 1 514 1 515 0 517 0 Name: condition, Length: 362, dtype: int64
# Imported here because this statement runs before the later line that imports
# the scikit-learn helpers; without it a top-to-bottom run raises NameError.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.  `stratify=Y` keeps the class
# proportions the same in both splits; `random_state` fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)
print(X.shape, X_train.shape, X_test.shape)
(362, 11) (289, 11) (73, 11)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Fit a logistic-regression classifier (library defaults) on the training split.
model = LogisticRegression()
model.fit(X_train, Y_train)

# Accuracy on the same rows the model was fitted on.
X_train_prediction = model.predict(X_train)
# accuracy_score expects the true labels first, then the predictions.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)
Accuracy on Training data : 0.7958477508650519
# Accuracy on the held-out rows.
X_test_prediction = model.predict(X_test)
# accuracy_score expects the true labels first, then the predictions.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)
Accuracy on Test data : 0.7808219178082192
from sklearn.neighbors import KNeighborsClassifier
# Fit a k-nearest-neighbours classifier with the library's default settings
# on the training split, then report mean accuracy on both splits.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
print ('Accuracy of KNN n-5, on the training set : {:.3f}'.format(knn.score(X_train, Y_train)))
print ('Accuracy of KNN n-5, on the testing set : {:.3f}'.format(knn.score(X_test, Y_test)))
Accuracy of KNN n-5, on the training set : 0.758 Accuracy of KNN n-5, on the testing set : 0.644
from sklearn.preprocessing import StandardScaler
# Standardise the features.  The scaler learns its mean/variance from the
# training split only and the SAME statistics are applied to the test split;
# re-fitting on the test split would leak test information and put the two
# splits on different scales.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# KNN is distance based, so a model fitted on the unscaled features cannot be
# scored on scaled ones; fit a fresh classifier on the standardised data.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
print ('Accuracy of KNN n-5, on the training set : {:.3f}'.format(knn.score(X_train, Y_train)))
print ('Accuracy of KNN n-5, on the testing set : {:.3f}'.format(knn.score(X_test, Y_test)))
Accuracy of KNN n-5, on the training set : 0.439 Accuracy of KNN n-5, on the testing set : 0.438
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest on the full (unscaled) feature matrix purely to rank
# the features by importance.
reg = RandomForestRegressor()
reg.fit(X, Y)

# Label each importance with the column it was computed for.  Taking the names
# from X guarantees they line up with the fitted features regardless of where
# the target column sits in `data`.
col = X.columns.tolist()
df_feature_importance = pd.DataFrame(
    reg.feature_importances_, index=col, columns=['feature importance']
).sort_values('feature importance', ascending=False)
df_feature_importance.plot(kind='bar')