import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split

import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, explained_variance_score

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

# Load the heart-disease table from CSV into a DataFrame.
# NOTE(review): the path is an absolute location on one Windows machine, so
# this line fails anywhere else -- consider a path relative to the notebook.
df=pd.read_csv(r"C:\Users\Ismail\Desktop\heart_disease_uci.csv")

# Show the column labels that were read from the file.
print(df.columns)

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

# Quick look at the raw table: the whole frame, then the first five rows,
# then summary statistics of the numeric columns.
df

df.head(5)

# describe is a method and must be called; the bare attribute only evaluates
# to the bound method object instead of computing count/mean/std/quantiles.
df.describe()

<bound method NDFrame.describe of       id  age     sex        dataset               cp  trestbps   chol    fbs  \
0      1   63    Male      Cleveland   typical angina     145.0  233.0   True   
1      2   67    Male      Cleveland     asymptomatic     160.0  286.0  False   
2      3   67    Male      Cleveland     asymptomatic     120.0  229.0  False   
3      4   37    Male      Cleveland      non-anginal     130.0  250.0  False   
4      5   41  Female      Cleveland  atypical angina     130.0  204.0  False   
..   ...  ...     ...            ...              ...       ...    ...    ...   
915  916   54  Female  VA Long Beach     asymptomatic     127.0  333.0   True   
916  917   62    Male  VA Long Beach   typical angina       NaN  139.0  False   
917  918   55    Male  VA Long Beach     asymptomatic     122.0  223.0   True   
918  919   58    Male  VA Long Beach     asymptomatic       NaN  385.0   True   
919  920   62    Male  VA Long Beach  atypical angina     120.0  254.0  False   

              restecg  thalch  exang  oldpeak        slope   ca  \
0      lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1      lv hypertrophy   108.0   True      1.5         flat  3.0   
2      lv hypertrophy   129.0   True      2.6         flat  2.0   
3              normal   187.0  False      3.5  downsloping  0.0   
4      lv hypertrophy   172.0  False      1.4    upsloping  0.0   
..                ...     ...    ...      ...          ...  ...   
915  st-t abnormality   154.0  False      0.0          NaN  NaN   
916  st-t abnormality     NaN    NaN      NaN          NaN  NaN   
917  st-t abnormality   100.0  False      0.0          NaN  NaN   
918    lv hypertrophy     NaN    NaN      NaN          NaN  NaN   
919    lv hypertrophy    93.0   True      0.0          NaN  NaN   

                  thal  num  
0         fixed defect    0  
1               normal    2  
2    reversable defect    1  
3               normal    0  
4               normal    0  
..                 ...  ...  
915                NaN    1  
916                NaN    0  
917       fixed defect    2  
918                NaN    0  
919                NaN    1  

[920 rows x 16 columns]>

# Keep only the rows whose `dataset` column is 'Cleveland'.
df_n = df[df['dataset'] == 'Cleveland']
df_n

# Per-sex views of the Cleveland rows.
df_Male = df_n[df_n['sex'] == 'Male']
df_Male

df_Female = df_n[df_n['sex'] == 'Female']
df_Female

# Working table: drop rows with chol above 420 or oldpeak outside [0, 4],
# then drop any row that still has a missing value and renumber from 0.
data = (
    df_n[(df_n['oldpeak'] >= 0) & (df_n['oldpeak'] <= 4) & (df_n['chol'] <= 420)]
    .dropna()
    .reset_index(drop=True)
)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        291 non-null    int64  
 1   age       291 non-null    int64  
 2   sex       291 non-null    object 
 3   dataset   291 non-null    object 
 4   cp        291 non-null    object 
 5   trestbps  291 non-null    float64
 6   chol      291 non-null    float64
 7   fbs       291 non-null    object 
 8   restecg   291 non-null    object 
 9   thalch    291 non-null    float64
 10  exang     291 non-null    object 
 11  oldpeak   291 non-null    float64
 12  slope     291 non-null    object 
 13  ca        291 non-null    float64
 14  thal      291 non-null    object 
 15  num       291 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 36.5+ KB

# Summary statistics of the cleaned table.
data.describe()

# Columns treated as categories in the plots below.
CATEGORICAL_COLS = [
    'sex',
    'cp',       # chest pain type
    'fbs',      # fasting blood sugar > 120 mg/dl
    'restecg',  # resting electrocardiographic result
    'exang',    # exercise induced angina
    'slope',    # slope of the peak exercise ST segment
    'thal',
    'ca',       # number of major vessels colored by fluoroscopy
]
# Columns treated as continuous measurements.
NUMERICAL_COLS = [
    'age',
    'trestbps',  # resting blood pressure
    'chol',      # cholesterol
    'thalch',    # maximum heart rate achieved
    'oldpeak',   # ST depression
]

# Split the cleaned table into its categorical and numerical parts.
heart_cat = data[CATEGORICAL_COLS]
heart_num = data[NUMERICAL_COLS]

# Number of distinct values per categorical column.
heart_cat.nunique()

sex        2
cp         4
fbs        2
restecg    3
exang      2
slope      3
thal       3
ca         4
dtype: int64

# 2x4 grid of bar charts: one frequency plot per categorical column.
fig, axes = plt.subplots(2, 4, figsize=(20,10))

# (column, panel title) pairs, listed in row-major panel order.
count_panels = [
    ('sex', 'Gender Distribution'),
    ('cp', 'Chest Pain Types'),
    ('fbs', 'Fasting Blood Sugar > 120 mg/dl'),
    ('restecg', 'Resting Electrocardiographic Results'),
    ('exang', 'Exercise Induced Angina'),
    ('slope', 'Slope of the Peak Exercise ST Segment'),
    ('thal', 'Defects'),
    ('ca', 'Number of Major Vessels colored by Fluoroscopy'),
]

for panel_ax, (cat_col, panel_title) in zip(axes.flat, count_panels):
    sns.countplot(x=cat_col, data=heart_cat, ax=panel_ax)
    if cat_col == 'cp':
        # Only the chest-pain panel gets its x tick labels rotated.
        panel_ax.tick_params(axis='x', rotation=45)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# 2x2 grid of scatter plots: age on the x axis against each other
# numerical column.
fig, axes = plt.subplots(2, 2, figsize=(10,10))

# (y column, panel title) pairs, listed in row-major panel order.
age_panels = [
    ('chol', 'Age Against Cholesterol Levels'),
    ('trestbps', 'Age Against Resting Blood Pressure'),
    ('thalch', 'Age Against Maximum Heart Rate Achieved'),
    ('oldpeak', 'Age Against ST Depression'),
]

for panel_ax, (y_col, panel_title) in zip(axes.flat, age_panels):
    heart_num.plot(x='age', y=y_col, kind='scatter', ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Maximum heart rate (thalch) against cholesterol, drawn three times with a
# different colouring each time: target value, sex, and resting ECG result.
fig, axes = plt.subplots(3, figsize=(7,10))

sns.scatterplot(x='chol', y='thalch', hue='num', data=data, ax=axes[0])
# Title wording corrected: "Effect" (noun), not "Affect".
axes[0].set_title('Effect of Cholesterol on Maximum Heart Rate')

sns.scatterplot(x='chol', y='thalch', hue='sex', data=data, ax=axes[1])

sns.scatterplot(x='chol', y='thalch', hue='restecg', data=data, ax=axes[2])

# Same layout pass as the other multi-panel figures in this notebook, so the
# stacked panels do not crowd each other's axis labels.
plt.tight_layout()
plt.show()

# Stand-alone plot: maximum heart rate against resting blood pressure,
# coloured by resting ECG result.
sns.scatterplot(x='trestbps', y='thalch', hue='restecg', data=data)
plt.show()

# Resting blood pressure (trestbps) against cholesterol, drawn three times
# with a different colouring each time: target value, sex, resting ECG result.
fig, axes = plt.subplots(3, figsize=(7,10))

# Title wording corrected: "Effect" (noun), not "Affect".
axes[0].set_title('Effect of Cholesterol on Resting Blood Pressure')
for panel_ax, hue_col in zip(axes, ['num', 'sex', 'restecg']):
    sns.scatterplot(x='chol', y='trestbps', hue=hue_col, data=data, ax=panel_ax)

plt.tight_layout()
plt.show()

# Mean of every numeric column per target value. numeric_only is explicit
# because the table also holds text columns (sex, cp, thal, ...): pandas 2.x
# raises on them instead of silently dropping them as older versions did.
data.groupby('num').mean(numeric_only=True)

# Mean cholesterol cross-tabulated by target value against three categorical
# columns. aggfunc is given by name ('mean'); passing the NumPy callable is
# deprecated in recent pandas and resolves to the same aggregation.
print('Average Cholesterol Level Based on Target Variable and Chest Pain Type')
print(pd.crosstab(index=data.num, columns=data.cp, values=data.chol, aggfunc='mean'))
print('\n')

print('Average Cholesterol Level Based on Target Variable and Patient Gender')
print(pd.crosstab(index=data.num, columns=data.sex, values=data.chol, aggfunc='mean'))
print('\n')

print('Average Cholesterol Level Based on Target Variable and Cardiographic Results')
print(pd.crosstab(index=data.num, columns=data.restecg, values=data.chol, aggfunc='mean'))

Average Cholesterol Level Based on Target Variable and Chest Pain Type
cp   asymptomatic  atypical angina  non-anginal  typical angina
num                                                            
0      245.333333       241.200000   242.203125           227.2
1      249.764706       254.166667   236.222222           262.2
2      264.103448       246.000000   247.750000           234.0
3      245.846154       300.000000   238.750000             NaN
4      251.500000              NaN   230.000000           231.0


Average Cholesterol Level Based on Target Variable and Patient Gender
sex      Female        Male
num                        
0    252.942857  232.034091
1    264.222222  246.133333
2    309.142857  248.785714
3    261.000000  245.423077
4    316.000000  234.400000


Average Cholesterol Level Based on Target Variable and Cardiographic Results
restecg  lv hypertrophy      normal  st-t abnormality
num                                                  
0            250.661538  235.163043             197.0
1            249.290323  248.956522               NaN
2            274.437500  245.111111             327.0
3            255.571429  237.500000             205.0
4            257.100000  202.500000               NaN

# Pairwise correlation of the numeric columns. The text columns are excluded
# up front: pandas 2.x no longer drops them silently inside corr() and raises
# on the first string value, while select_dtypes works on every version.
corr = data.select_dtypes(include='number').corr()
print(corr)

# Colour-coded view of the same matrix.
sns.heatmap(corr)
plt.show()

                id       age  trestbps      chol    thalch   oldpeak  \
id        1.000000  0.009105 -0.017465 -0.106741 -0.116757 -0.138977   
age       0.009105  1.000000  0.295220  0.193051 -0.401816  0.198531   
trestbps -0.017465  0.295220  1.000000  0.175936 -0.042743  0.163879   
chol     -0.106741  0.193051  0.175936  1.000000 -0.008048  0.040159   
thalch   -0.116757 -0.401816 -0.042743 -0.008048  1.000000 -0.344198   
oldpeak  -0.138977  0.198531  0.163879  0.040159 -0.344198  1.000000   
ca        0.039114  0.374728  0.099386  0.138833 -0.268102  0.262771   
num       0.024629  0.228222  0.165492  0.087107 -0.413299  0.485032   

                ca       num  
id        0.039114  0.024629  
age       0.374728  0.228222  
trestbps  0.099386  0.165492  
chol      0.138833  0.087107  
thalch   -0.268102 -0.413299  
oldpeak   0.262771  0.485032  
ca        1.000000  0.503025  
num       0.503025  1.000000

# Box plot of the cleaned table, one box per column, all on a single shared
# axis.
# NOTE(review): the columns have very different ranges (e.g. oldpeak vs chol),
# so the small-valued boxes are hard to read -- consider one plot per column.
data.boxplot()
plt.show()

	id	age	trestbps	chol	thalch	oldpeak	ca	num
count	291.000000	291.000000	291.000000	291.000000	291.000000	291.000000	291.000000	291.000000
mean	150.237113	54.457045	131.498282	246.158076	149.855670	0.987285	0.659794	0.920962
std	87.725250	9.094184	17.587883	48.554652	22.989948	1.055009	0.915987	1.213689
min	1.000000	29.000000	94.000000	126.000000	71.000000	0.000000	0.000000	0.000000
25%	73.500000	47.500000	120.000000	211.000000	133.500000	0.000000	0.000000	0.000000
50%	149.000000	56.000000	130.000000	242.000000	153.000000	0.800000	0.000000	0.000000
75%	226.500000	61.000000	140.000000	275.000000	166.000000	1.600000	1.000000	2.000000
max	302.000000	77.000000	200.000000	417.000000	202.000000	4.000000	3.000000	4.000000

	id	age	trestbps	chol	thalch	oldpeak	ca
num
0	146.208861	52.512658	128.955696	241.297468	158.658228	0.569620	0.278481
1	152.185185	55.611111	133.277778	249.148148	145.981481	1.022222	0.740741
2	169.371429	58.200000	134.371429	260.857143	135.000000	1.802857	1.257143
3	154.312500	56.000000	134.406250	248.343750	132.625000	1.646875	1.406250
4	127.833333	59.833333	140.833333	248.000000	140.666667	2.191667	1.583333

	id	age	sex	dataset	cp	trestbps	chol	fbs	restecg	thalch	exang	oldpeak	slope	ca	thal	num
0	1	63	Male	Cleveland	typical angina	145.0	233.0	True	lv hypertrophy	150.0	False	2.3	downsloping	0.0	fixed defect	0
1	2	67	Male	Cleveland	asymptomatic	160.0	286.0	False	lv hypertrophy	108.0	True	1.5	flat	3.0	normal	2
2	3	67	Male	Cleveland	asymptomatic	120.0	229.0	False	lv hypertrophy	129.0	True	2.6	flat	2.0	reversable defect	1
3	4	37	Male	Cleveland	non-anginal	130.0	250.0	False	normal	187.0	False	3.5	downsloping	0.0	normal	0
4	5	41	Female	Cleveland	atypical angina	130.0	204.0	False	lv hypertrophy	172.0	False	1.4	upsloping	0.0	normal	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
915	916	54	Female	VA Long Beach	asymptomatic	127.0	333.0	True	st-t abnormality	154.0	False	0.0	NaN	NaN	NaN	1
916	917	62	Male	VA Long Beach	typical angina	NaN	139.0	False	st-t abnormality	NaN	NaN	NaN	NaN	NaN	NaN	0
917	918	55	Male	VA Long Beach	asymptomatic	122.0	223.0	True	st-t abnormality	100.0	False	0.0	NaN	NaN	fixed defect	2
918	919	58	Male	VA Long Beach	asymptomatic	NaN	385.0	True	lv hypertrophy	NaN	NaN	NaN	NaN	NaN	NaN	0
919	920	62	Male	VA Long Beach	atypical angina	120.0	254.0	False	lv hypertrophy	93.0	True	0.0	NaN	NaN	NaN	1