import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
import sklearn
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, learning_curve, ShuffleSplit
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, confusion_matrix, explained_variance_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
# Load the heart-disease CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file exists.
df = pd.read_csv(r"C:\Users\Ismail\Desktop\heart_disease_uci.csv")
print(df.columns)
# The bare expressions below only render output when they are the last
# statement of a notebook cell; in a plain script they are no-ops.
df
df.head(5)
# describe() must be called: the bare attribute `df.describe` merely
# references the bound method and computes no summary statistics.
df.describe()
# Keep only the rows whose `dataset` column equals 'Cleveland'.
df_n = df.loc[df['dataset'] == 'Cleveland']
df_n
# Split that subset by the value of the `sex` column.
df_Male = df_n.loc[df_n['sex'] == 'Male']
df_Male
df_Female = df_n.loc[df_n['sex'] == 'Female']
df_Female
# All further analysis works on the Cleveland subset.
data = df_n
# Row filter: chol at most 420 and oldpeak within [0, 4] (both bounds inclusive).
chol_in_range = data['chol'].le(420)
oldpeak_in_range = data['oldpeak'].ge(0) & data['oldpeak'].le(4)
data = data[chol_in_range & oldpeak_in_range].reset_index(drop=True)
# Drop every row that still has a missing value in any column, then renumber.
data = data.dropna().reset_index(drop=True)
data.info()
data.describe()
# Column groups used by the plots below.
CATEGORICAL_COLS = [
    'sex', 'cp', 'fbs', 'restecg',
    'exang', 'slope', 'thal', 'ca',
]
NUMERICAL_COLS = [
    'age', 'trestbps', 'chol', 'thalch', 'oldpeak',
]
# Per-group views of the cleaned data.
heart_cat = data.loc[:, CATEGORICAL_COLS]
heart_num = data.loc[:, NUMERICAL_COLS]
# Number of distinct values in each categorical column.
heart_cat.nunique()
# One count plot per categorical column on a 2x4 grid, filled row by row.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
_count_panels = [
    ('sex', 'Gender Distribution'),
    ('cp', 'Chest Pain Types'),
    ('fbs', 'Fasting Blood Sugar > 120 mg/dl'),
    ('restecg', 'Resting Electrocardiographic Results'),
    ('exang', 'Exercise Induced Angina'),
    ('slope', 'Slope of the Peak Exercise ST Segment'),
    ('thal', 'Defects'),
    ('ca', 'Number of Major Vessels colored by Fluoroscopy'),
]
for _ax, (_col, _title) in zip(axes.flat, _count_panels):
    sns.countplot(x=_col, data=heart_cat, ax=_ax)
    if _col == 'cp':
        # Only the chest-pain panel gets its x tick labels rotated.
        _ax.tick_params(axis='x', rotation=45)
    _ax.set_title(_title)
plt.tight_layout()
plt.show()
# Scatter each remaining numeric column against age on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
_age_panels = [
    ('chol', 'Age Against Cholesterol Levels'),
    ('trestbps', 'Age Against Resting Blood Pressure'),
    ('thalch', 'Age Against Maximum Heart Rate Achieved'),
    ('oldpeak', 'Age Against ST Depression'),
]
for _ax, (_ycol, _title) in zip(axes.flat, _age_panels):
    heart_num.plot(x='age', y=_ycol, kind='scatter', ax=_ax)
    _ax.set_title(_title)
plt.tight_layout()
plt.show()
# chol vs. thalch, drawn three times with a different hue column per panel.
fig, axes = plt.subplots(3, figsize=(7, 10))
for _ax, _hue in zip(axes, ('num', 'sex', 'restecg')):
    sns.scatterplot(x='chol', y='thalch', hue=_hue, data=data, ax=_ax)
# Only the top panel carries a title.
axes[0].set_title('Affect of Cholesterol on Maximum Heart Rate')
plt.show()

# Stand-alone figure: trestbps vs. thalch, coloured by restecg.
sns.scatterplot(x='trestbps', y='thalch', hue='restecg', data=data)
plt.show()
# chol vs. trestbps, drawn three times with a different hue column per panel.
fig, axes = plt.subplots(3, figsize=(7, 10))
# Only the top panel carries a title.
axes[0].set_title('Affect of Cholesterol on Resting Blood Pressure')
for _ax, _hue in zip(axes, ('num', 'sex', 'restecg')):
    sns.scatterplot(x='chol', y='trestbps', hue=_hue, data=data, ax=_ax)
plt.tight_layout()
plt.show()
# Mean of every numeric column per value of `num`.
# numeric_only=True is required because `data` still contains text columns
# (e.g. `sex`, `dataset`); pandas >= 2.0 raises instead of silently skipping them.
data.groupby('num').mean(numeric_only=True)

# Mean cholesterol for each `num` value, broken down by one categorical column
# at a time. Each entry is (heading to print, column used for the breakdown).
_chol_breakdowns = [
    ('Average Cholesterol Level Based on Target Variable and Chest Pain Type', 'cp'),
    ('Average Cholesterol Level Based on Target Variable and Patient Gender', 'sex'),
    ('Average Cholesterol Level Based on Target Variable and Cardiographic Results', 'restecg'),
]
for _position, (_heading, _column) in enumerate(_chol_breakdowns):
    if _position:
        # Blank separator between consecutive tables (none after the last one).
        print('\n')
    print(_heading)
    # The string 'mean' is the aggregator spelling pandas recommends; passing
    # np.mean is deprecated in recent pandas releases.
    print(pd.crosstab(index=data['num'], columns=data[_column],
                      values=data['chol'], aggfunc='mean'))
# Pairwise correlation of the numeric (and boolean) columns.
# `data` still contains text columns such as `sex`; on pandas >= 2.0 a plain
# data.corr() raises ValueError on them, so select numeric/bool columns first.
# select_dtypes is used instead of corr(numeric_only=True) because that
# keyword only exists from pandas 1.5 onward.
corr = data.select_dtypes(include=['number', 'bool']).corr()
print(corr)
sns.heatmap(corr)
plt.show()

# Box plot of the DataFrame's columns.
data.boxplot()
plt.show()