From 3241f4fec17ff614ae445c09a8b6f2d54ef1b8ba Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 6 Jan 2022 09:38:04 +0100
Subject: [PATCH] Use score in tree hyperparameter notebook

---
 python_scripts/trees_hyperparameters.py | 71 +++++++++++++++++++------
 1 file changed, 56 insertions(+), 15 deletions(-)

diff --git a/python_scripts/trees_hyperparameters.py b/python_scripts/trees_hyperparameters.py
index 0d4742c37..24182dc12 100644
--- a/python_scripts/trees_hyperparameters.py
+++ b/python_scripts/trees_hyperparameters.py
@@ -114,13 +114,23 @@ def plot_regression(model, X, y, ax=None):
 # %%
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
 plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
-_ = plt.title(f"Shallow classification tree with max-depth of {max_depth}")
+_ = plt.title(
+    f"Shallow classification tree with max-depth of {max_depth}"
+    f"\n Accuracy of the fit: {accuracy:.2f}"
+)
 
 # %%
-plot_regression(tree_reg, data_reg[data_reg_columns],
-                data_reg[target_reg_column])
-_ = plt.title(f"Shallow regression tree with max-depth of {max_depth}")
+plot_regression(tree_reg, data_reg[data_reg_columns], data_reg[target_reg_column])
+r2 = tree_reg.score(data_reg[data_reg_columns], data_reg[target_reg_column])
+
+_ = plt.title(
+    f"Shallow regression tree with max-depth of {max_depth}"
+    f"\n R$^2$ of the fit: {r2:.2f}"
+)
 
 # %% [markdown]
 # Now, let's increase the `max_depth` parameter value to check the difference
@@ -134,13 +144,24 @@ def plot_regression(model, X, y, ax=None):
 # %%
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
 plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
-_ = plt.title(f"Deep classification tree with max-depth of {max_depth}")
+_ = plt.title(
+    f"Deep classification tree with max-depth of {max_depth}"
+    f"\n Accuracy of the fit: {accuracy:.2f}"
+)
 
 # %%
 plot_regression(tree_reg, data_reg[data_reg_columns],
                 data_reg[target_reg_column])
+r2 = tree_reg.score(data_reg[data_reg_columns], data_reg[target_reg_column])
+
-_ = plt.title(f"Deep regression tree with max-depth of {max_depth}")
+_ = plt.title(
+    f"Deep regression tree with max-depth of {max_depth}"
+    f"\n R$^2$ of the fit: {r2:.2f}"
+)
 
 # %% [markdown]
 # For both classification and regression setting, we observe that
@@ -160,15 +181,23 @@ def plot_regression(model, X, y, ax=None):
 # %%
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
 plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
 _ = plt.title(f"Optimal depth found via CV: "
-              f"{tree_clf.best_params_['max_depth']}")
+              f"{tree_clf.best_params_['max_depth']}"
+              f"\n Accuracy of the fit: {accuracy:.2f}")
 
 # %%
 plot_regression(tree_reg, data_reg[data_reg_columns],
                 data_reg[target_reg_column])
+r2 = tree_reg.score(data_reg[data_reg_columns],
+                    data_reg[target_reg_column])
+
 _ = plt.title(f"Optimal depth found via CV: "
-              f"{tree_reg.best_params_['max_depth']}")
+              f"{tree_reg.best_params_['max_depth']}"
+              f"\n R$^2$ of the fit: {r2:.2f}")
 
 # %% [markdown]
 # With this example, we see that there is not a single value that is optimal
@@ -225,7 +254,11 @@ def plot_regression(model, X, y, ax=None):
 tree_clf = DecisionTreeClassifier(max_depth=max_depth)
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
-_ = plt.title(f"Decision tree with max-depth of {max_depth}")
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
+_ = plt.title(f"Decision tree with max-depth of {max_depth}"
+              f"\n Accuracy of the fit: {accuracy:.2f}")
 
 # %% [markdown]
 # As expected, we see that the blue blob on the right and the red blob on the
@@ -252,7 +285,11 @@ def plot_regression(model, X, y, ax=None):
 tree_clf = DecisionTreeClassifier(max_depth=max_depth)
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
-_ = plt.title(f"Decision tree with max-depth of {max_depth}")
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
+_ = plt.title(f"Decision tree with max-depth of {max_depth}"
+              f"\n Accuracy of the fit: {accuracy:.2f}")
 
 # %%
 _, ax = plt.subplots(figsize=(11, 7))
@@ -274,15 +311,19 @@ def plot_regression(model, X, y, ax=None):
 tree_clf = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
 _ = plt.title(
-    f"Decision tree with leaf having at least {min_samples_leaf} samples")
+    f"Decision tree with leaf having at least {min_samples_leaf} samples"
+    f"\n Accuracy of the fit: {accuracy:.2f}")
 
 # %%
 _, ax = plt.subplots(figsize=(10, 7))
 _ = plot_tree(tree_clf, ax=ax, feature_names=data_clf_columns)
 
 # %% [markdown]
-# This hyperparameter allows to have leaves with a minimum number of samples
-# and no further splits will be search otherwise. Therefore, these
-# hyperparameters could be an alternative to fix the `max_depth`
-# hyperparameter.
+# This hyperparameter ensures that each leaf contains at least a minimum
+# number of samples; otherwise, no further split is searched. Similarly, the
+# other hyperparameters mentioned above can be tuned as an alternative to
+# fixing `max_depth`.
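The patch above leans on scikit-learn's `score` convention: `ClassifierMixin.score` returns the mean accuracy on the given data, while `RegressorMixin.score` returns the coefficient of determination R^2. Below is a minimal sketch verifying this behavior; the `make_blobs`/`make_regression` datasets are synthetic stand-ins for illustration only and are not part of the patch or the notebook.

import numpy as np
from sklearn.datasets import make_blobs, make_regression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Synthetic stand-ins for the notebook's datasets (illustration only).
X_clf, y_clf = make_blobs(n_samples=200, centers=2, random_state=0)
X_reg, y_reg = make_regression(n_samples=200, n_features=1, random_state=0)

tree_clf = DecisionTreeClassifier(max_depth=2).fit(X_clf, y_clf)
# For classifiers, `score` is the mean accuracy on the given data...
assert np.isclose(tree_clf.score(X_clf, y_clf),
                  accuracy_score(y_clf, tree_clf.predict(X_clf)))

tree_reg = DecisionTreeRegressor(max_depth=2).fit(X_reg, y_reg)
# ...while for regressors it is R^2, which is why the regression titles
# report "R$^2$ of the fit" rather than an accuracy.
assert np.isclose(tree_reg.score(X_reg, y_reg),
                  r2_score(y_reg, tree_reg.predict(X_reg)))

Because the two scores measure different quantities, the regression cells store the value under the name `r2` instead of reusing `accuracy`.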