In [27]:
import numpy as np
import pandas as pd
import math
import random
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


import warnings
warnings.filterwarnings("ignore")
In [2]:
data = pd.read_csv("heart_disease_uci.csv")
In [3]:
data.head()
Out[3]:
id age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
0 1 63 Male Cleveland typical angina 145.0 233.0 True lv hypertrophy 150.0 False 2.3 downsloping 0.0 fixed defect 0
1 2 67 Male Cleveland asymptomatic 160.0 286.0 False lv hypertrophy 108.0 True 1.5 flat 3.0 normal 2
2 3 67 Male Cleveland asymptomatic 120.0 229.0 False lv hypertrophy 129.0 True 2.6 flat 2.0 reversable defect 1
3 4 37 Male Cleveland non-anginal 130.0 250.0 False normal 187.0 False 3.5 downsloping 0.0 normal 0
4 5 41 Female Cleveland atypical angina 130.0 204.0 False lv hypertrophy 172.0 False 1.4 upsloping 0.0 normal 0
In [4]:
data['condition'] = data['num']
In [5]:
data = data.drop(columns = ['id','dataset','ca','thal','num'])
In [6]:
data.head()
Out[6]:
age sex cp trestbps chol fbs restecg thalch exang oldpeak slope condition
0 63 Male typical angina 145.0 233.0 True lv hypertrophy 150.0 False 2.3 downsloping 0
1 67 Male asymptomatic 160.0 286.0 False lv hypertrophy 108.0 True 1.5 flat 2
2 67 Male asymptomatic 120.0 229.0 False lv hypertrophy 129.0 True 2.6 flat 1
3 37 Male non-anginal 130.0 250.0 False normal 187.0 False 3.5 downsloping 0
4 41 Female atypical angina 130.0 204.0 False lv hypertrophy 172.0 False 1.4 upsloping 0
In [7]:
data = data[(data['chol'] <= 420) & (data['oldpeak'] >=0) & (data['oldpeak'] <=4)].reset_index(drop=True)
data = data.dropna().reset_index(drop=True)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        520 non-null    int64  
 1   sex        520 non-null    object 
 2   cp         520 non-null    object 
 3   trestbps   520 non-null    float64
 4   chol       520 non-null    float64
 5   fbs        520 non-null    object 
 6   restecg    520 non-null    object 
 7   thalch     520 non-null    float64
 8   exang      520 non-null    object 
 9   oldpeak    520 non-null    float64
 10  slope      520 non-null    object 
 11  condition  520 non-null    int64  
dtypes: float64(4), int64(2), object(6)
memory usage: 48.9+ KB
In [8]:
def str_features_to_numeric(data):
    
    categorical_columns = []
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features = data.columns.values.tolist()
    for col in features:
        if data[col].dtype in numerics: continue
        categorical_columns.append(col)
    
    # Encoding categorical features
    for col in categorical_columns:
        if col in data.columns:
            le = LabelEncoder()
            le.fit(list(data[col].astype(str).values))
            data[col] = le.transform(list(data[col].astype(str).values))
    
    return data
In [9]:
data = str_features_to_numeric(data)
data
Out[9]:
age sex cp trestbps chol fbs restecg thalch exang oldpeak slope condition
0 63 1 3 145.0 233.0 1 0 150.0 0 2.3 0 0
1 67 1 0 160.0 286.0 0 0 108.0 1 1.5 1 2
2 67 1 0 120.0 229.0 0 0 129.0 1 2.6 1 1
3 37 1 2 130.0 250.0 0 1 187.0 0 3.5 0 0
4 41 0 1 130.0 204.0 0 0 172.0 0 1.4 2 0
... ... ... ... ... ... ... ... ... ... ... ... ...
515 57 1 0 130.0 207.0 0 2 96.0 1 1.0 1 0
516 74 1 0 155.0 310.0 0 1 112.0 1 1.5 0 2
517 51 0 0 114.0 258.0 1 0 96.0 0 1.0 2 0
518 62 1 0 160.0 254.0 1 2 108.0 1 3.0 1 4
519 53 1 0 144.0 300.0 1 2 128.0 1 1.5 1 3

520 rows × 12 columns

In [11]:
data.condition.value_counts()
Out[11]:
0    203
1    159
2     70
3     68
4     20
Name: condition, dtype: int64
In [13]:
data = data[data['condition'].isin([0, 1])]
data
Out[13]:
age sex cp trestbps chol fbs restecg thalch exang oldpeak slope condition
0 63 1 3 145.0 233.0 1 0 150.0 0 2.3 0 0
2 67 1 0 120.0 229.0 0 0 129.0 1 2.6 1 1
3 37 1 2 130.0 250.0 0 1 187.0 0 3.5 0 0
4 41 0 1 130.0 204.0 0 0 172.0 0 1.4 2 0
5 56 1 1 120.0 236.0 0 1 178.0 0 0.8 2 0
... ... ... ... ... ... ... ... ... ... ... ... ...
512 60 1 0 130.0 186.0 1 0 140.0 1 0.5 1 1
513 55 1 0 120.0 226.0 0 0 127.0 1 1.7 0 1
514 56 1 0 130.0 203.0 1 1 98.0 0 1.5 1 1
515 57 1 0 130.0 207.0 0 2 96.0 1 1.0 1 0
517 51 0 0 114.0 258.0 1 0 96.0 0 1.0 2 0

362 rows × 12 columns

In [14]:
data.isnull().sum()
Out[14]:
age          0
sex          0
cp           0
trestbps     0
chol         0
fbs          0
restecg      0
thalch       0
exang        0
oldpeak      0
slope        0
condition    0
dtype: int64
In [15]:
data.shape
Out[15]:
(362, 12)
In [16]:
data['condition'].value_counts()
Out[16]:
0    203
1    159
Name: condition, dtype: int64
In [17]:
X = data.drop(columns='condition', axis=1)
Y = data['condition']
print(X)
     age  sex  cp  trestbps   chol  fbs  restecg  thalch  exang  oldpeak  \
0     63    1   3     145.0  233.0    1        0   150.0      0      2.3   
2     67    1   0     120.0  229.0    0        0   129.0      1      2.6   
3     37    1   2     130.0  250.0    0        1   187.0      0      3.5   
4     41    0   1     130.0  204.0    0        0   172.0      0      1.4   
5     56    1   1     120.0  236.0    0        1   178.0      0      0.8   
..   ...  ...  ..       ...    ...  ...      ...     ...    ...      ...   
512   60    1   0     130.0  186.0    1        0   140.0      1      0.5   
513   55    1   0     120.0  226.0    0        0   127.0      1      1.7   
514   56    1   0     130.0  203.0    1        1    98.0      0      1.5   
515   57    1   0     130.0  207.0    0        2    96.0      1      1.0   
517   51    0   0     114.0  258.0    1        0    96.0      0      1.0   

     slope  
0        0  
2        1  
3        0  
4        2  
5        2  
..     ...  
512      1  
513      0  
514      1  
515      1  
517      2  

[362 rows x 11 columns]
In [18]:
print(Y)
0      0
2      1
3      0
4      0
5      0
      ..
512    1
513    1
514    1
515    0
517    0
Name: condition, Length: 362, dtype: int64
In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
In [20]:
print(X.shape, X_train.shape, X_test.shape)
(362, 11) (289, 11) (73, 11)
In [22]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model = LogisticRegression()
model.fit(X_train, Y_train)
Out[22]:
LogisticRegression()
In [23]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
In [24]:
print('Accuracy on Training data : ', training_data_accuracy)
Accuracy on Training data :  0.7958477508650519
In [25]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
In [26]:
print('Accuracy on Test data : ', test_data_accuracy)
Accuracy on Test data :  0.7808219178082192
In [29]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
Out[29]:
KNeighborsClassifier()
In [31]:
print ('Accuracy of KNN n-5, on the training set : {:.3f}'.format(knn.score(X_train, Y_train)))
print ('Accuracy of KNN n-5, on the testing set : {:.3f}'.format(knn.score(X_test, Y_test)))
Accuracy of KNN n-5, on the training set : 0.758
Accuracy of KNN n-5, on the testing set : 0.644
In [32]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform (X_test)
In [33]:
print ('Accuracy of KNN n-5, on the training set : {:.3f}'.format(knn.score(X_train, Y_train)))
print ('Accuracy of KNN n-5, on the testing set : {:.3f}'.format(knn.score(X_test, Y_test)))
Accuracy of KNN n-5, on the training set : 0.439
Accuracy of KNN n-5, on the testing set : 0.438
In [34]:
knn.predict(X_test)
Out[34]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1], dtype=int64)
In [35]:
from sklearn.ensemble import RandomForestRegressor
reg = RandomForestRegressor()
reg.fit(X,Y)
Out[35]:
RandomForestRegressor()
In [37]:
col = data.columns.tolist()[:-1]
df_feature_importance = pd.DataFrame (reg.feature_importances_, index = col, columns= ['feature importance']).sort_values('feature importance',ascending = False)
df_feature_importance.plot(kind = 'bar')
Out[37]:
<AxesSubplot:>
In [ ]: