This is a multivariate dataset, meaning each record involves several separate mathematical or statistical variables. It comprises 14 attributes: age, sex, chest pain type, resting blood pressure, serum cholesterol, fasting blood sugar, resting electrocardiographic results, maximum heart rate achieved, exercise-induced angina, oldpeak (ST depression induced by exercise relative to rest), the slope of the peak exercise ST segment, number of major vessels, and thalassemia. The full database contains 76 attributes, but all published studies use a subset of 14 of them, and the Cleveland database is the only one that has been used by ML researchers to date. Two main tasks are associated with this dataset: predicting from a patient's attributes whether that person has heart disease, and the exploratory task of mining the data for insights that deepen understanding of the problem.
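Before modeling, it helps to confirm the shape of the data and how the target is coded. A minimal peek (an illustrative sketch; it re-reads the same UCI file used in the pipeline below, so it assumes network access):

import pandas as pd
peek = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',
    header=None, na_values='?')
print(peek.shape)               # (303, 14): 303 patients, 13 features plus the target
print(peek[13].value_counts())  # last column: 0 = no disease, 1-4 = disease severity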
# Import the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.base import is_classifier  # used below to route scoring by model type
# Load the dataset; missing values appear as '?' in the raw file
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
df = pd.read_csv(url, header=None, na_values='?')
df.columns = [
    'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'serum_cholesterol',
    'fasting_blood_sugar', 'resting_ecg', 'max_heart_rate_achieved', 'exercise_induced_angina',
    'oldpeak', 'slope', 'num_major_vessels', 'thalassemia', 'target'
]
# Handle missing values
df = df.dropna()
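# Only 6 of the 303 rows contain '?' values (in num_major_vessels and
# thalassemia), so dropping them leaves 297 usable rows. A sketch of an
# alternative that keeps every row (an assumption, not part of the pipeline
# here) is median imputation in place of the dropna above:
#   from sklearn.impute import SimpleImputer
#   df[:] = SimpleImputer(strategy='median').fit_transform(df)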
# Preprocess the data. The raw target takes values 0-4 (0 = no disease,
# 1-4 = increasing severity); published studies binarize it to presence vs.
# absence, which matches the prediction task described above.
X = df.drop('target', axis=1)
y = (df['target'] > 0).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)  # stratify keeps the class balance consistent across the split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
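# Scaling matters for the penalized linear models below; the tree ensembles are
# scale-invariant, but standardized inputs do them no harm. A quick optional
# sanity check: each training-set column should now have mean ~0 and std ~1.
print(X_train_scaled.mean(axis=0).round(2))
print(X_train_scaled.std(axis=0).round(2))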
# Define the models and hyperparameter grids to train and evaluate.
# liblinear is chosen for LogisticRegression because the default lbfgs solver
# does not support the l1 penalty; the regression models treat the binary
# disease indicator as a continuous target for comparison.
models = [
    {'name': 'Logistic Regression', 'model': LogisticRegression(solver='liblinear'), 'params': {'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10, 100]}},
    {'name': 'Ridge Regression', 'model': Ridge(), 'params': {'alpha': [0.1, 1, 10, 100]}},
    {'name': 'Lasso Regression', 'model': Lasso(), 'params': {'alpha': [0.1, 1, 10, 100]}},
    {'name': 'Decision Tree Classifier', 'model': DecisionTreeClassifier(random_state=42), 'params': {'max_depth': [3, 5, 7, 10]}},
    {'name': 'Random Forest Classifier', 'model': RandomForestClassifier(random_state=42), 'params': {'n_estimators': [50, 100, 150]}},
    {'name': 'Decision Tree Regressor', 'model': DecisionTreeRegressor(random_state=42), 'params': {'max_depth': [3, 5, 7, 10]}},
    {'name': 'Random Forest Regressor', 'model': RandomForestRegressor(random_state=42), 'params': {'n_estimators': [50, 100, 150]}},
]
# Train and evaluate the models: accuracy for classifiers, R^2 for regressors.
# is_classifier() routes the scoring reliably; matching on the substring
# 'Classifier' in the display name would misroute Logistic Regression to r2.
for model in models:
    print(f'Training {model["name"]}...')
    scoring = 'accuracy' if is_classifier(model['model']) else 'r2'
    grid_search = GridSearchCV(model['model'], model['params'], cv=5, scoring=scoring)
    grid_search.fit(X_train_scaled, y_train)
    y_pred = grid_search.predict(X_test_scaled)
    if is_classifier(model['model']):
        score = accuracy_score(y_test, y_pred)
    else:
        score = r2_score(y_test, y_pred)
    print(f'Test {model["name"]} score: {score:.3f}')
    print(f'Best {model["name"]} parameters: {grid_search.best_params_}\n')
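As an optional follow-up (a sketch, not part of the pipeline above), the f1_score and mean_squared_error imports can round out the evaluation: F1 complements accuracy for the binary classification task, and RMSE gives the regressors an error in target units. The random forest grids are refit here for illustration.

rf_clf = GridSearchCV(RandomForestClassifier(random_state=42),
                      {'n_estimators': [50, 100, 150]}, cv=5, scoring='accuracy')
rf_clf.fit(X_train_scaled, y_train)
print(f'Random Forest Classifier F1: {f1_score(y_test, rf_clf.predict(X_test_scaled)):.3f}')

rf_reg = GridSearchCV(RandomForestRegressor(random_state=42),
                      {'n_estimators': [50, 100, 150]}, cv=5, scoring='r2')
rf_reg.fit(X_train_scaled, y_train)
rmse = mean_squared_error(y_test, rf_reg.predict(X_test_scaled)) ** 0.5  # version-safe RMSE
print(f'Random Forest Regressor RMSE: {rmse:.3f}')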