Heart Disease Data

This is a multivariate dataset, i.e., one involving several separate mathematical or statistical variables, suited to multivariate numerical data analysis. It is composed of 14 attributes: age, sex, chest pain type, resting blood pressure, serum cholesterol, fasting blood sugar, resting electrocardiographic results, maximum heart rate achieved, exercise-induced angina, oldpeak (ST depression induced by exercise relative to rest), the slope of the peak exercise ST segment, number of major vessels, thalassemia, and the diagnosis of heart disease (the target). The full database includes 76 attributes, but all published studies use a subset of 14 of them, and the Cleveland database is the only one that ML researchers have used to date. One major task on this dataset is to predict, from the given attributes, whether a particular patient has heart disease; the other is the exploratory task of diagnosing and extracting insights that help in understanding the problem.

By Mohammed Abdul Wasay

Hint: heart_disease_uci.csv

Instructions:

  1. Use the Lifecycle of Data Science
  2. Use necessary data preprocessing techniques
  3. Use various regression and classification techniques for comparison
  4. Use metrics for regression and classification where needed
  5. Use various pipeline/hyperparameter tuning techniques to improve performance

1. Import libraries

2. Download datasets
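A minimal sketch of this step, assuming the heart_disease_uci.csv file from the hint above is available locally:

import pandas as pd

# Load the UCI heart disease data; the file name comes from the hint above.
df = pd.read_csv("heart_disease_uci.csv")

# Quick sanity checks: shape, column types and missing values.
print(df.shape)
print(df.dtypes)
print(df.isna().sum())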

Outlier filtering: keep records where cholesterol <= 420 and oldpeak is between 0 and 4 (a sketch follows).
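A hedged sketch of this filter, reading the note as an outlier rule and assuming the UCI column names chol and oldpeak:

# Keep rows inside the stated ranges, i.e. drop rows with cholesterol
# above 420 or oldpeak outside [0, 4]; column names are assumptions.
mask = (df["chol"] <= 420) & df["oldpeak"].between(0, 4)
df = df[mask].reset_index(drop=True)
print(df.shape)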

3. EDA (Exploratory Data Analysis) & FE (Feature Engineering)

3.1. FE - UCI Heart Disease Data: Advanced FE

3.2. EDA

The analysis shows different patterns, but most importantly it confirms that the features are quite diverse: none of them are too strongly correlated. Some features cluster the target values quite well, but none do so with 100% accuracy. Thus a good dataset has been formed, but it is impossible to unambiguously choose the optimal model. There is little data, so any model can overfit.
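One illustration of the kind of check behind this conclusion is a correlation heat map; a minimal sketch (seaborn is an assumption, any plotting library works):

import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations of the numeric features; strong off-diagonal
# values would signal redundant features, which we do not observe here.
plt.figure(figsize=(10, 8))
sns.heatmap(df.select_dtypes("number").corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature correlation matrix")
plt.show()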

4. Preparing for modeling
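The preparation code itself is not shown in this extract; a minimal sketch of what this step typically covers, assuming the UCI target column num (0 = no disease, >0 = disease). The encoding and split choices are assumptions, and the X_train/X_test variables are reused by the model sketches in section 5:

import pandas as pd
from sklearn.model_selection import train_test_split

# Binarize the target: num is the UCI diagnosis column (assumption).
y = (df["num"] > 0).astype(int)

# One-hot encode categorical features; drop the target (and any id column).
X = pd.get_dummies(df.drop(columns=["num"]), drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)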
5. Tuning models and testing on all features

Now we are ready to train a model and predict the required solution. There are 60+ predictive modelling algorithms to choose from. We must understand the type of problem and the solution requirements to narrow down to a few models that we can evaluate. Our problem is a classification problem, and it is a supervised learning task. With these two criteria, we can narrow our choice of models down to the following decision-tree and boosting models:

Decision Tree Classifier, Random Forest Classifier, AdaBoost Classifier, XGB Classifier, LGBM Classifier, ExtraTrees Classifier

Each model is built using cross-validation (except LGBM). The parameters of each model are selected to maximize the agreement between the accuracy on the training and validation data. A plot is built for this purpose with learning_curve from the sklearn library.

"""
Generate 2 plots: 
- the test and training learning curve, 
- the training samples vs fit times curve.

Parameters
----------
estimator : object type that implements the "fit" and "predict" methods
    An object of that type which is cloned for each validation.

title : string
    Title for the chart.

X : array-like, shape (n_samples, n_features)
    Training vector, where n_samples is the number of samples and
    n_features is the number of features.

y : array-like, shape (n_samples) or (n_samples, n_features), optional
    Target relative to X for classification or regression;
    None for unsupervised learning.

axes : array of 2 axes, optional (default=None)
    Axes to use for plotting the curves.

ylim : tuple, shape (ymin, ymax), optional
    Defines the minimum and maximum y values plotted.

cv : int, cross-validation generator or an iterable, optional
    Determines the cross-validation splitting strategy.
    Possible inputs for cv are:

      - None, to use the default 5-fold cross-validation,
      - integer, to specify the number of folds,
      - :term:`CV splitter`,
      - an iterable yielding (train, test) splits as arrays of indices.

    For integer/None inputs, if ``y`` is binary or multiclass,
    :class:`StratifiedKFold` is used. If the estimator is not a classifier
    or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

    Refer :ref:`User Guide <cross_validation>` for the various
    cross-validators that can be used here.

train_sizes : array-like, shape (n_ticks,), dtype float or int
    Relative or absolute numbers of training examples that will be used to
    generate the learning curve. If the dtype is float, it is regarded as a
    fraction of the maximum size of the training set (that is determined
    by the selected validation method), i.e. it has to be within (0, 1].
    Otherwise it is interpreted as absolute sizes of the training sets.
    Note that for classification the number of samples usually has to
    be big enough to contain at least one sample from each class.
    (default: np.linspace(0.1, 1.0, 5))

random_state : int, RandomState instance or None, optional (default=None)
    Controls the randomness of the cross-validation splitting.

"""

5.1 Decision Tree Classifier

This model uses a Decision Tree as a predictive model which maps features (tree branches) to conclusions about the target value (tree leaves). Tree models where the target variable can take a finite set of values are called classification trees; in these tree structures, leaves represent class labels and branches represent conjunctions of features that lead to those class labels. Decision trees where the target variable can take continuous values (typically real numbers) are called regression trees. Reference Wikipedia.
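A minimal sketch of tuning such a tree with cross-validation, assuming the X_train/y_train split from section 4; the parameter grid is illustrative, not the notebook's exact one:

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Illustrative grid; depth and leaf-size limits control over-fitting.
param_grid = {"max_depth": [3, 5, 7, None],
              "min_samples_leaf": [1, 5, 10]}

dt_search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                         param_grid, cv=5, scoring="accuracy")
dt_search.fit(X_train, y_train)
print(dt_search.best_params_, dt_search.best_score_)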

5.2 Random Forest Classifier

Random Forest is one of the most popular models. Random forests or random decision forests are an ensemble learning method for classification, regression and other tasks that operates by constructing a multitude of decision trees (here n_estimators = [100, 300]) at training time and outputting the class that is the mode of the classes (classification) or the mean prediction (regression) of the individual trees. Reference Wikipedia.
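A sketch of the corresponding search; the n_estimators values [100, 300] come from the text above, while the rest of the grid is an assumption:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# n_estimators = [100, 300] as stated above; max_depth values are illustrative.
rf_search = GridSearchCV(RandomForestClassifier(random_state=42),
                         {"n_estimators": [100, 300], "max_depth": [5, 10, None]},
                         cv=5, scoring="accuracy")
rf_search.fit(X_train, y_train)
print(rf_search.best_params_, rf_search.best_score_)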

5.3 XGB Classifier
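The body of this section did not survive in this extract; as a placeholder, a hedged sketch of fitting an XGBoost classifier (the xgboost package and all parameters here are assumptions, not the notebook's original setup):

from xgboost import XGBClassifier

# Illustrative parameters; tune them as with the other models.
xgb = XGBClassifier(n_estimators=300, max_depth=3, learning_rate=0.1,
                    eval_metric="logloss", random_state=42)
xgb.fit(X_train, y_train)
print(xgb.score(X_test, y_test))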

5.4 LGBM Classifier

Light GBM is a fast, distributed, high-performance gradient boosting framework based on decision tree algorithms. It splits the tree leaf-wise with the best fit, whereas other boosting algorithms split the tree depth-wise or level-wise. So, when growing on the same leaf, the leaf-wise algorithm can reduce more loss than the level-wise algorithm, and hence results in much better accuracy, which can rarely be achieved by any of the existing boosting algorithms. It is also surprisingly fast, hence the word 'Light'. Reference Analytics Vidhya.
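Per the note in section 5, this model is fitted without the cross-validated search; a minimal sketch using a hold-out validation set with early stopping instead (the split and parameters are assumptions):

import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Hold out a validation set for early stopping (split ratio is illustrative).
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
                                            test_size=0.2, random_state=42)

lgbm = lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, random_state=42)
lgbm.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
         callbacks=[lgb.early_stopping(stopping_rounds=50)])
print(lgbm.score(X_test, y_test))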

5.5 Extra Trees Classifier

ExtraTreesClassifier implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The default values for the parameters controlling the size of the trees (e.g. max_depth, min_samples_leaf, etc.) lead to fully grown and unpruned trees which can potentially be very large on some data sets. To reduce memory consumption, the complexity and size of the trees should be controlled by setting those parameter values. Reference sklearn documentation.

In extremely randomized trees, randomness goes one step further in the way splits are computed. As in random forests, a random subset of candidate features is used, but instead of looking for the most discriminative thresholds, thresholds are drawn at random for each candidate feature and the best of these randomly generated thresholds is picked as the splitting rule. This usually makes it possible to reduce the variance of the model a bit more, at the expense of a slightly greater increase in bias. Reference sklearn documentation.
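A short sketch contrasting the two ensembles just described, with illustrative parameters:

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Same-size ensembles differing only in how split thresholds are chosen:
# random thresholds (extra-trees) vs. best thresholds (random forest).
for model in (ExtraTreesClassifier(n_estimators=100, random_state=42),
              RandomForestClassifier(n_estimators=100, random_state=42)):
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(type(model).__name__, round(scores.mean(), 3))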

5.6 AdaBoost Classifier

The core principle of AdaBoost ("Adaptive Boosting") is to fit a sequence of weak learners (i.e., models that are only slightly better than random guessing, such as small decision trees) on repeatedly modified versions of the data. The predictions from all of them are then combined through a weighted majority vote (or sum) to produce the final prediction. The data modifications at each so-called boosting iteration consist of applying weights w1, w2, ..., wN to each of the N training samples. Initially, those weights are all set to 1/N, so that the first step simply trains a weak learner on the original data. For each successive iteration, the sample weights are individually modified and the learning algorithm is reapplied to the reweighted data. At a given step, those training examples that were incorrectly predicted by the boosted model induced at the previous step have their weights increased, whereas the weights are decreased for those that were predicted correctly. As iterations proceed, examples that are difficult to predict receive ever-increasing influence. Each subsequent weak learner is thereby forced to concentrate on the examples that are missed by the previous ones in the sequence. Reference sklearn documentation.
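A minimal sketch with decision stumps as the weak learners described above (the parameters are assumptions):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Depth-1 trees (stumps) as the weak learners; in scikit-learn < 1.2
# pass base_estimator= instead of estimator=.
ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                         n_estimators=200, learning_rate=0.5, random_state=42)
ada.fit(X_train, y_train)
print(ada.score(X_test, y_test))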

6. Models evaluation

We can now rank our evaluation of all the models to choose the best one for our problem.

Larger values of r2_score_diff indicate overfitting.
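One way to build this ranking, assuming the fitted models from section 5 and reading r2_score_diff as the train/test score gap (that reading of the name is an assumption):

import pandas as pd

# Rank models by test score; the train/test gap flags over-fitting.
rows = []
for name, model in {"DecisionTree": dt_search.best_estimator_,
                    "RandomForest": rf_search.best_estimator_,
                    "XGB": xgb, "LGBM": lgbm, "AdaBoost": ada}.items():
    train_s = model.score(X_train, y_train)
    test_s = model.score(X_test, y_test)
    rows.append({"model": name, "train": train_s, "test": test_s,
                 "r2_score_diff": train_s - test_s})

print(pd.DataFrame(rows).sort_values("test", ascending=False))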

7. Conclusion