import numpy as np
import pandas as pd
# Load the heart-disease CSV from its local path and preview the first rows.
CSV_PATH = 'C:/My Files/Python Practice/Assignment ML DL -3.csv'
data = pd.read_csv(CSV_PATH)
data.head()
id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 2 |
2 | 3 | 67 | Male | Cleveland | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | Cleveland | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | Cleveland | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
# Collapse the graded `num` column into a binary target:
# any value above 0 becomes 1, everything else becomes 0.
has_disease = data['num'] > 0
data['num'] = np.where(has_disease, 1, 0)
data.head()
id | age | sex | dataset | cp | trestbps | chol | fbs | restecg | thalch | exang | oldpeak | slope | ca | thal | num | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | Male | Cleveland | typical angina | 145.0 | 233.0 | True | lv hypertrophy | 150.0 | False | 2.3 | downsloping | 0.0 | fixed defect | 0 |
1 | 2 | 67 | Male | Cleveland | asymptomatic | 160.0 | 286.0 | False | lv hypertrophy | 108.0 | True | 1.5 | flat | 3.0 | normal | 1 |
2 | 3 | 67 | Male | Cleveland | asymptomatic | 120.0 | 229.0 | False | lv hypertrophy | 129.0 | True | 2.6 | flat | 2.0 | reversable defect | 1 |
3 | 4 | 37 | Male | Cleveland | non-anginal | 130.0 | 250.0 | False | normal | 187.0 | False | 3.5 | downsloping | 0.0 | normal | 0 |
4 | 5 | 41 | Female | Cleveland | atypical angina | 130.0 | 204.0 | False | lv hypertrophy | 172.0 | False | 1.4 | upsloping | 0.0 | normal | 0 |
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import tree
from sklearn.metrics import accuracy_score,confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Keep only the three predictors and the target, discarding any row that has
# a missing value in one of them, then split into feature matrix and target.
FEATURE_COLUMNS = ['ca', 'age', 'chol']
data = data[FEATURE_COLUMNS + ['num']].dropna()
X = data[FEATURE_COLUMNS]
y = data['num']
for part in (X, y):
    print(part.shape)
(308, 3) (308,)
# Display only: dropna() returns a new frame and the result is not assigned,
# so X itself is left unchanged. Rows with missing values were already
# removed when `data` was filtered before X was built.
X.dropna()
ca | age | chol | |
---|---|---|---|
0 | 0.0 | 63 | 233.0 |
1 | 3.0 | 67 | 286.0 |
2 | 2.0 | 67 | 229.0 |
3 | 0.0 | 37 | 250.0 |
4 | 0.0 | 41 | 204.0 |
... | ... | ... | ... |
676 | 1.0 | 60 | 0.0 |
691 | 2.0 | 62 | 0.0 |
717 | 2.0 | 72 | 0.0 |
748 | 0.0 | 56 | 100.0 |
759 | 0.0 | 59 | 0.0 |
308 rows × 3 columns
# Stratified train/test split (default test size). The seed is fixed so that
# the split -- and therefore every score, plot and the pruning alpha chosen
# from them -- is the same on every run instead of changing with each
# execution.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=0
)
print(x_train.shape)
print(x_test.shape)
(231, 3) (77, 3)
# Baseline: a decision tree with default (unrestricted) growth settings.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = tree.DecisionTreeClassifier(random_state=0).fit(x_train, y_train)
y_train_pred, y_test_pred = clf.predict(x_train), clf.predict(x_test)
y_train_pred
array([0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0])
# Display the per-class probabilities predicted for the test rows
# (one row per sample, one column per class, ordered as in clf.classes_).
clf.predict_proba(x_test)
array([[1., 0.], [1., 0.], [0., 1.], [0., 1.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [0., 1.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [0., 1.], [0., 1.], [1., 0.], [1., 0.], [1., 0.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [1., 0.], [1., 0.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [0., 1.], [0., 1.], [1., 0.], [1., 0.], [1., 0.], [0., 1.], [1., 0.], [1., 0.], [0., 1.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [1., 0.], [0., 1.], [0., 1.], [0., 1.], [1., 0.], [1., 0.], [1., 0.], [0., 1.], [1., 0.], [1., 0.], [0., 1.], [0., 1.], [0., 1.], [0., 1.], [1., 0.], [1., 0.], [1., 0.], [0., 1.], [0., 1.], [1., 0.], [0., 1.], [1., 0.], [0., 1.], [1., 0.], [1., 0.], [1., 0.], [0., 1.], [0., 1.]])
# Draw the baseline tree, labelling splits with the feature names and leaves
# with readable class names, then print the estimator's reference docs.
features = X.columns
classes = ['Not heart disease','heart disease']
plt.figure(figsize=(20, 20))
tree.plot_tree(
    clf,
    feature_names=features,
    class_names=classes,
    filled=True,
)
plt.show()
help(tree.DecisionTreeClassifier)
Help on class DecisionTreeClassifier in module sklearn.tree._classes: class DecisionTreeClassifier(sklearn.base.ClassifierMixin, BaseDecisionTree) | DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0) | | A decision tree classifier. | | Read more in the :ref:`User Guide <tree>`. | | Parameters | ---------- | criterion : {"gini", "entropy"}, default="gini" | The function to measure the quality of a split. Supported criteria are | "gini" for the Gini impurity and "entropy" for the information gain. | | splitter : {"best", "random"}, default="best" | The strategy used to choose the split at each node. Supported | strategies are "best" to choose the best split and "random" to choose | the best random split. | | max_depth : int, default=None | The maximum depth of the tree. If None, then nodes are expanded until | all leaves are pure or until all leaves contain less than | min_samples_split samples. | | min_samples_split : int or float, default=2 | The minimum number of samples required to split an internal node: | | - If int, then consider `min_samples_split` as the minimum number. | - If float, then `min_samples_split` is a fraction and | `ceil(min_samples_split * n_samples)` are the minimum | number of samples for each split. | | .. versionchanged:: 0.18 | Added float values for fractions. | | min_samples_leaf : int or float, default=1 | The minimum number of samples required to be at a leaf node. | A split point at any depth will only be considered if it leaves at | least ``min_samples_leaf`` training samples in each of the left and | right branches. This may have the effect of smoothing the model, | especially in regression. | | - If int, then consider `min_samples_leaf` as the minimum number. 
| - If float, then `min_samples_leaf` is a fraction and | `ceil(min_samples_leaf * n_samples)` are the minimum | number of samples for each node. | | .. versionchanged:: 0.18 | Added float values for fractions. | | min_weight_fraction_leaf : float, default=0.0 | The minimum weighted fraction of the sum total of weights (of all | the input samples) required to be at a leaf node. Samples have | equal weight when sample_weight is not provided. | | max_features : int, float or {"auto", "sqrt", "log2"}, default=None | The number of features to consider when looking for the best split: | | - If int, then consider `max_features` features at each split. | - If float, then `max_features` is a fraction and | `int(max_features * n_features)` features are considered at each | split. | - If "auto", then `max_features=sqrt(n_features)`. | - If "sqrt", then `max_features=sqrt(n_features)`. | - If "log2", then `max_features=log2(n_features)`. | - If None, then `max_features=n_features`. | | Note: the search for a split does not stop until at least one | valid partition of the node samples is found, even if it requires to | effectively inspect more than ``max_features`` features. | | random_state : int, RandomState instance or None, default=None | Controls the randomness of the estimator. The features are always | randomly permuted at each split, even if ``splitter`` is set to | ``"best"``. When ``max_features < n_features``, the algorithm will | select ``max_features`` at random at each split before finding the best | split among them. But the best found split may vary across different | runs, even if ``max_features=n_features``. That is the case, if the | improvement of the criterion is identical for several splits and one | split has to be selected at random. To obtain a deterministic behaviour | during fitting, ``random_state`` has to be fixed to an integer. | See :term:`Glossary <random_state>` for details. 
| | max_leaf_nodes : int, default=None | Grow a tree with ``max_leaf_nodes`` in best-first fashion. | Best nodes are defined as relative reduction in impurity. | If None then unlimited number of leaf nodes. | | min_impurity_decrease : float, default=0.0 | A node will be split if this split induces a decrease of the impurity | greater than or equal to this value. | | The weighted impurity decrease equation is the following:: | | N_t / N * (impurity - N_t_R / N_t * right_impurity | - N_t_L / N_t * left_impurity) | | where ``N`` is the total number of samples, ``N_t`` is the number of | samples at the current node, ``N_t_L`` is the number of samples in the | left child, and ``N_t_R`` is the number of samples in the right child. | | ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, | if ``sample_weight`` is passed. | | .. versionadded:: 0.19 | | class_weight : dict, list of dict or "balanced", default=None | Weights associated with classes in the form ``{class_label: weight}``. | If None, all classes are supposed to have weight one. For | multi-output problems, a list of dicts can be provided in the same | order as the columns of y. | | Note that for multioutput (including multilabel) weights should be | defined for each class of every column in its own dict. For example, | for four-class multilabel classification weights should be | [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of | [{1:1}, {2:5}, {3:1}, {4:1}]. | | The "balanced" mode uses the values of y to automatically adjust | weights inversely proportional to class frequencies in the input data | as ``n_samples / (n_classes * np.bincount(y))`` | | For multi-output, the weights of each column of y will be multiplied. | | Note that these weights will be multiplied with sample_weight (passed | through the fit method) if sample_weight is specified. | | ccp_alpha : non-negative float, default=0.0 | Complexity parameter used for Minimal Cost-Complexity Pruning. 
The | subtree with the largest cost complexity that is smaller than | ``ccp_alpha`` will be chosen. By default, no pruning is performed. See | :ref:`minimal_cost_complexity_pruning` for details. | | .. versionadded:: 0.22 | | Attributes | ---------- | classes_ : ndarray of shape (n_classes,) or list of ndarray | The classes labels (single output problem), | or a list of arrays of class labels (multi-output problem). | | feature_importances_ : ndarray of shape (n_features,) | The impurity-based feature importances. | The higher, the more important the feature. | The importance of a feature is computed as the (normalized) | total reduction of the criterion brought by that feature. It is also | known as the Gini importance [4]_. | | Warning: impurity-based feature importances can be misleading for | high cardinality features (many unique values). See | :func:`sklearn.inspection.permutation_importance` as an alternative. | | max_features_ : int | The inferred value of max_features. | | n_classes_ : int or list of int | The number of classes (for single output problems), | or a list containing the number of classes for each | output (for multi-output problems). | | n_features_ : int | The number of features when ``fit`` is performed. | | .. deprecated:: 1.0 | `n_features_` is deprecated in 1.0 and will be removed in | 1.2. Use `n_features_in_` instead. | | n_features_in_ : int | Number of features seen during :term:`fit`. | | .. versionadded:: 0.24 | | feature_names_in_ : ndarray of shape (`n_features_in_`,) | Names of features seen during :term:`fit`. Defined only when `X` | has feature names that are all strings. | | .. versionadded:: 1.0 | | n_outputs_ : int | The number of outputs when ``fit`` is performed. | | tree_ : Tree instance | The underlying Tree object. Please refer to | ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and | :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` | for basic usage of these attributes. 
| | See Also | -------- | DecisionTreeRegressor : A decision tree regressor. | | Notes | ----- | The default values for the parameters controlling the size of the trees | (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and | unpruned trees which can potentially be very large on some data sets. To | reduce memory consumption, the complexity and size of the trees should be | controlled by setting those parameter values. | | The :meth:`predict` method operates using the :func:`numpy.argmax` | function on the outputs of :meth:`predict_proba`. This means that in | case the highest predicted probabilities are tied, the classifier will | predict the tied class with the lowest index in :term:`classes_`. | | References | ---------- | | .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning | | .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification | and Regression Trees", Wadsworth, Belmont, CA, 1984. | | .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical | Learning", Springer, 2009. | | .. [4] L. Breiman, and A. Cutler, "Random Forests", | https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm | | Examples | -------- | >>> from sklearn.datasets import load_iris | >>> from sklearn.model_selection import cross_val_score | >>> from sklearn.tree import DecisionTreeClassifier | >>> clf = DecisionTreeClassifier(random_state=0) | >>> iris = load_iris() | >>> cross_val_score(clf, iris.data, iris.target, cv=10) | ... # doctest: +SKIP | ... | array([ 1. , 0.93..., 0.86..., 0.93..., 0.93..., | 0.93..., 0.93..., 1. , 0.93..., 1. 
]) | | Method resolution order: | DecisionTreeClassifier | sklearn.base.ClassifierMixin | BaseDecisionTree | sklearn.base.MultiOutputMixin | sklearn.base.BaseEstimator | builtins.object | | Methods defined here: | | __init__(self, *, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0) | Initialize self. See help(type(self)) for accurate signature. | | fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted='deprecated') | Build a decision tree classifier from the training set (X, y). | | Parameters | ---------- | X : {array-like, sparse matrix} of shape (n_samples, n_features) | The training input samples. Internally, it will be converted to | ``dtype=np.float32`` and if a sparse matrix is provided | to a sparse ``csc_matrix``. | | y : array-like of shape (n_samples,) or (n_samples, n_outputs) | The target values (class labels) as integers or strings. | | sample_weight : array-like of shape (n_samples,), default=None | Sample weights. If None, then samples are equally weighted. Splits | that would create child nodes with net zero or negative weight are | ignored while searching for a split in each node. Splits are also | ignored if they would result in any single class carrying a | negative weight in either child node. | | check_input : bool, default=True | Allow to bypass several input checking. | Don't use this parameter unless you know what you do. | | X_idx_sorted : deprecated, default="deprecated" | This parameter is deprecated and has no effect. | It will be removed in 1.1 (renaming of 0.26). | | .. deprecated:: 0.24 | | Returns | ------- | self : DecisionTreeClassifier | Fitted estimator. | | predict_log_proba(self, X) | Predict class log-probabilities of the input samples X. 
| | Parameters | ---------- | X : {array-like, sparse matrix} of shape (n_samples, n_features) | The input samples. Internally, it will be converted to | ``dtype=np.float32`` and if a sparse matrix is provided | to a sparse ``csr_matrix``. | | Returns | ------- | proba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1 | The class log-probabilities of the input samples. The order of the | classes corresponds to that in the attribute :term:`classes_`. | | predict_proba(self, X, check_input=True) | Predict class probabilities of the input samples X. | | The predicted class probability is the fraction of samples of the same | class in a leaf. | | Parameters | ---------- | X : {array-like, sparse matrix} of shape (n_samples, n_features) | The input samples. Internally, it will be converted to | ``dtype=np.float32`` and if a sparse matrix is provided | to a sparse ``csr_matrix``. | | check_input : bool, default=True | Allow to bypass several input checking. | Don't use this parameter unless you know what you do. | | Returns | ------- | proba : ndarray of shape (n_samples, n_classes) or list of n_outputs such arrays if n_outputs > 1 | The class probabilities of the input samples. The order of the | classes corresponds to that in the attribute :term:`classes_`. | | ---------------------------------------------------------------------- | Readonly properties defined here: | | n_features_ | DEPRECATED: The attribute `n_features_` is deprecated in 1.0 and will be removed in 1.2. Use `n_features_in_` instead. | | ---------------------------------------------------------------------- | Data and other attributes defined here: | | __abstractmethods__ = frozenset() | | ---------------------------------------------------------------------- | Methods inherited from sklearn.base.ClassifierMixin: | | score(self, X, y, sample_weight=None) | Return the mean accuracy on the given test data and labels. 
| | In multi-label classification, this is the subset accuracy | which is a harsh metric since you require for each sample that | each label set be correctly predicted. | | Parameters | ---------- | X : array-like of shape (n_samples, n_features) | Test samples. | | y : array-like of shape (n_samples,) or (n_samples, n_outputs) | True labels for `X`. | | sample_weight : array-like of shape (n_samples,), default=None | Sample weights. | | Returns | ------- | score : float | Mean accuracy of ``self.predict(X)`` wrt. `y`. | | ---------------------------------------------------------------------- | Data descriptors inherited from sklearn.base.ClassifierMixin: | | __dict__ | dictionary for instance variables (if defined) | | __weakref__ | list of weak references to the object (if defined) | | ---------------------------------------------------------------------- | Methods inherited from BaseDecisionTree: | | apply(self, X, check_input=True) | Return the index of the leaf that each sample is predicted as. | | .. versionadded:: 0.17 | | Parameters | ---------- | X : {array-like, sparse matrix} of shape (n_samples, n_features) | The input samples. Internally, it will be converted to | ``dtype=np.float32`` and if a sparse matrix is provided | to a sparse ``csr_matrix``. | | check_input : bool, default=True | Allow to bypass several input checking. | Don't use this parameter unless you know what you do. | | Returns | ------- | X_leaves : array-like of shape (n_samples,) | For each datapoint x in X, return the index of the leaf x | ends up in. Leaves are numbered within | ``[0; self.tree_.node_count)``, possibly with gaps in the | numbering. | | cost_complexity_pruning_path(self, X, y, sample_weight=None) | Compute the pruning path during Minimal Cost-Complexity Pruning. | | See :ref:`minimal_cost_complexity_pruning` for details on the pruning | process. 
| | Parameters | ---------- | X : {array-like, sparse matrix} of shape (n_samples, n_features) | The training input samples. Internally, it will be converted to | ``dtype=np.float32`` and if a sparse matrix is provided | to a sparse ``csc_matrix``. | | y : array-like of shape (n_samples,) or (n_samples, n_outputs) | The target values (class labels) as integers or strings. | | sample_weight : array-like of shape (n_samples,), default=None | Sample weights. If None, then samples are equally weighted. Splits | that would create child nodes with net zero or negative weight are | ignored while searching for a split in each node. Splits are also | ignored if they would result in any single class carrying a | negative weight in either child node. | | Returns | ------- | ccp_path : :class:`~sklearn.utils.Bunch` | Dictionary-like object, with the following attributes. | | ccp_alphas : ndarray | Effective alphas of subtree during pruning. | | impurities : ndarray | Sum of the impurities of the subtree leaves for the | corresponding alpha value in ``ccp_alphas``. | | decision_path(self, X, check_input=True) | Return the decision path in the tree. | | .. versionadded:: 0.18 | | Parameters | ---------- | X : {array-like, sparse matrix} of shape (n_samples, n_features) | The input samples. Internally, it will be converted to | ``dtype=np.float32`` and if a sparse matrix is provided | to a sparse ``csr_matrix``. | | check_input : bool, default=True | Allow to bypass several input checking. | Don't use this parameter unless you know what you do. | | Returns | ------- | indicator : sparse matrix of shape (n_samples, n_nodes) | Return a node indicator CSR matrix where non zero elements | indicates that the samples goes through the nodes. | | get_depth(self) | Return the depth of the decision tree. | | The depth of a tree is the maximum distance between the root | and any leaf. | | Returns | ------- | self.tree_.max_depth : int | The maximum depth of the tree. 
| | get_n_leaves(self) | Return the number of leaves of the decision tree. | | Returns | ------- | self.tree_.n_leaves : int | Number of leaves. | | predict(self, X, check_input=True) | Predict class or regression value for X. | | For a classification model, the predicted class for each sample in X is | returned. For a regression model, the predicted value based on X is | returned. | | Parameters | ---------- | X : {array-like, sparse matrix} of shape (n_samples, n_features) | The input samples. Internally, it will be converted to | ``dtype=np.float32`` and if a sparse matrix is provided | to a sparse ``csr_matrix``. | | check_input : bool, default=True | Allow to bypass several input checking. | Don't use this parameter unless you know what you do. | | Returns | ------- | y : array-like of shape (n_samples,) or (n_samples, n_outputs) | The predicted classes, or the predict values. | | ---------------------------------------------------------------------- | Readonly properties inherited from BaseDecisionTree: | | feature_importances_ | Return the feature importances. | | The importance of a feature is computed as the (normalized) total | reduction of the criterion brought by that feature. | It is also known as the Gini importance. | | Warning: impurity-based feature importances can be misleading for | high cardinality features (many unique values). See | :func:`sklearn.inspection.permutation_importance` as an alternative. | | Returns | ------- | feature_importances_ : ndarray of shape (n_features,) | Normalized total reduction of criteria by feature | (Gini importance). | | ---------------------------------------------------------------------- | Methods inherited from sklearn.base.BaseEstimator: | | __getstate__(self) | | __repr__(self, N_CHAR_MAX=700) | Return repr(self). | | __setstate__(self, state) | | get_params(self, deep=True) | Get parameters for this estimator. 
| | Parameters | ---------- | deep : bool, default=True | If True, will return the parameters for this estimator and | contained subobjects that are estimators. | | Returns | ------- | params : dict | Parameter names mapped to their values. | | set_params(self, **params) | Set the parameters of this estimator. | | The method works on simple estimators as well as on nested objects | (such as :class:`~sklearn.pipeline.Pipeline`). The latter have | parameters of the form ``<component>__<parameter>`` so that it's | possible to update each component of a nested object. | | Parameters | ---------- | **params : dict | Estimator parameters. | | Returns | ------- | self : estimator instance | Estimator instance.
# Pre-pruned variant: cap the depth at 10 and require at least 10 samples
# before a node may be split, then refit and redraw the tree.
clf = tree.DecisionTreeClassifier(
    random_state=0,
    max_depth=10,
    min_samples_split=10,
).fit(x_train, y_train)
y_train_pred, y_test_pred = clf.predict(x_train), clf.predict(x_test)

features = X.columns
classes = ['Not heart disease','heart disease']
plot_options = dict(feature_names=features, class_names=classes, filled=True)
plt.figure(figsize=(20, 20))
tree.plot_tree(clf, **plot_options)
plt.show()
def plot_confusionmatrix(y_train_pred,y_train,dom):
    """Print a heading and draw an annotated confusion-matrix heatmap.

    Parameters
    ----------
    y_train_pred : array-like
        Predicted class labels.
    y_train : array-like
        True class labels, aligned with ``y_train_pred``.
    dom : str
        Name of the data split (for example ``'Train'``); used only in the
        printed heading.

    Notes
    -----
    Tick labels are taken from the module-level ``classes`` list, which must
    be defined before this function is called.
    """
    print(f'{dom} Confusion matrix')
    # scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are
    # the actual classes and columns the predicted ones. Passing them the
    # other way round silently transposes the matrix.
    cf = confusion_matrix(y_train, y_train_pred)
    sns.heatmap(cf, annot=True, yticklabels=classes,
                xticklabels=classes, cmap='Blues', fmt='g')
    # Label the axes so the orientation is unambiguous to the reader.
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.show()
# Report accuracy on both splits, then show the matching confusion matrices.
train_score = accuracy_score(y_train_pred, y_train)
print(f'Train score {train_score}')
test_score = accuracy_score(y_test_pred, y_test)
print(f'Test score {test_score}')
for split_name, predicted, actual in (
    ('Train', y_train_pred, y_train),
    ('Test', y_test_pred, y_test),
):
    plot_confusionmatrix(predicted, actual, dom=split_name)
Train score 0.8571428571428571 Test score 0.6753246753246753 Train Confusion matrix
Test Confusion matrix
# Compute the cost-complexity pruning path from a fresh, unconstrained tree.
# The candidate trees fitted for each alpha use these same default settings,
# so the alphas must come from the same kind of tree. Reading them from the
# global `clf` would use the depth-limited model (or whatever `clf` was last
# rebound to), which gives an alpha grid that does not correspond to the
# candidates. cost_complexity_pruning_path fits internally, so the estimator
# does not need to be fitted beforehand.
pruning_tree = tree.DecisionTreeClassifier(random_state=0)
path = pruning_tree.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
print(ccp_alphas)
[0. 0.0008658 0.00109068 0.00229334 0.002886 0.00310435 0.00395108 0.00488234 0.00580653 0.00673401 0.00682753 0.00809043 0.00944664 0.01154917 0.01198391 0.01319316 0.014829 0.10148728]
# Fit one tree per candidate alpha. A comprehension is used so the
# module-level `clf` is not rebound to the last (most heavily pruned)
# candidate, which would change the result of any cell that reads `clf`
# when it is re-run afterwards.
clfs = [
    tree.DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(x_train, y_train)
    for alpha in ccp_alphas
]
# Drop the last candidate (the largest alpha) from both sequences before
# plotting -- presumably the trivial single-node tree, as in the scikit-learn
# pruning example; confirm.
# NOTE: re-running this cell trims one more element each time.
ccp_alphas = ccp_alphas[:-1]
clfs = clfs[:-1]

# Tree size and depth as a function of alpha.
node_counts = [fitted.tree_.node_count for fitted in clfs]
depth = [fitted.tree_.max_depth for fitted in clfs]
size_series = ((node_counts, 'no of nodes'), (depth, 'depth'))
for values, series_label in size_series:
    plt.scatter(ccp_alphas, values)
for values, series_label in size_series:
    plt.plot(ccp_alphas, values, label=series_label, drawstyle="steps-post")
plt.legend()
plt.show()
# Accuracy of every pruned candidate on both splits. Predictions are computed
# inline so the module-level `y_train_pred` / `y_test_pred` (read by the
# score and confusion-matrix cells) are not overwritten with the predictions
# of whichever candidate happened to be last in the loop.
train_acc = [accuracy_score(y_train, c.predict(x_train)) for c in clfs]
test_acc = [accuracy_score(y_test, c.predict(x_test)) for c in clfs]

plt.scatter(ccp_alphas, train_acc)
plt.scatter(ccp_alphas, test_acc)
plt.plot(ccp_alphas, train_acc, label='train_accuracy', drawstyle="steps-post")
plt.plot(ccp_alphas, test_acc, label='test_accuracy', drawstyle="steps-post")
plt.xlabel('alpha')
plt.ylabel('accuracy')
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()
# Final model: an otherwise default tree pruned with a fixed ccp_alpha.
# NOTE(review): 0.006 looks hand-picked from the accuracy-vs-alpha plot, which
# includes test accuracy -- confirm it was not tuned on the test split.
clf_ = tree.DecisionTreeClassifier(random_state=0, ccp_alpha=0.006).fit(x_train, y_train)
y_train_pred, y_test_pred = clf_.predict(x_train), clf_.predict(x_test)

train_score = accuracy_score(y_train_pred, y_train)
print(f'Train score {train_score}')
test_score = accuracy_score(y_test_pred, y_test)
print(f'Test score {test_score}')

plot_confusionmatrix(y_train_pred, y_train, dom='Train')
plot_confusionmatrix(y_test_pred, y_test, dom='Test')
Train score 0.8528138528138528 Test score 0.7012987012987013 Train Confusion matrix
Test Confusion matrix
# Draw the pruned tree. Feature names must come from X (the three columns the
# model was fitted on); `data.columns` also contains the target column `num`,
# which is not a model feature and does not match the fitted feature count.
plt.figure(figsize=(20, 20))
features = X.columns
classes = ['Not heart disease','heart disease']
tree.plot_tree(clf_, feature_names=features, class_names=classes, filled=True)
plt.show()