From 3241f4fec17ff614ae445c09a8b6f2d54ef1b8ba Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Thu, 6 Jan 2022 09:38:04 +0100
Subject: [PATCH] Use score in tree hyperparameter notebook

---
 python_scripts/trees_hyperparameters.py | 71 +++++++++++++++++++------
 1 file changed, 56 insertions(+), 15 deletions(-)

diff --git a/python_scripts/trees_hyperparameters.py b/python_scripts/trees_hyperparameters.py
index 0d4742c37..24182dc12 100644
--- a/python_scripts/trees_hyperparameters.py
+++ b/python_scripts/trees_hyperparameters.py
@@ -114,13 +114,23 @@ def plot_regression(model, X, y, ax=None):
 # %%
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
 plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
-_ = plt.title(f"Shallow classification tree with max-depth of {max_depth}")
+_ = plt.title(
+    f"Shallow classification tree with max-depth of {max_depth}"
+    f"\n Accuracy of the fit: {accuracy:.2f}"
+)
 
 # %%
-plot_regression(tree_reg, data_reg[data_reg_columns],
-                data_reg[target_reg_column])
-_ = plt.title(f"Shallow regression tree with max-depth of {max_depth}")
+plot_regression(tree_reg, data_reg[data_reg_columns], data_reg[target_reg_column])
+r2 = tree_reg.score(data_reg[data_reg_columns], data_reg[target_reg_column])
+
+_ = plt.title(
+    f"Shallow regression tree with max-depth of {max_depth}"
+    f"\n R$^2$ of the fit: {r2:.2f}"
+)
 
 # %% [markdown]
 # Now, let's increase the `max_depth` parameter value to check the difference
@@ -134,13 +144,24 @@ def plot_regression(model, X, y, ax=None):
 # %%
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
 plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
-_ = plt.title(f"Deep classification tree with max-depth of {max_depth}")
+_ = plt.title(
+    f"Deep classification tree with max-depth of {max_depth}"
+    f"\n Accuracy of the fit: {accuracy:.2f}"
+)
 
 # %%
 plot_regression(tree_reg, data_reg[data_reg_columns],
                 data_reg[target_reg_column])
+r2 = tree_reg.score(data_reg[data_reg_columns], data_reg[target_reg_column])
+
-_ = plt.title(f"Deep regression tree with max-depth of {max_depth}")
+_ = plt.title(
+    f"Deep regression tree with max-depth of {max_depth}"
+    f"\n R$^2$ of the fit: {r2:.2f}"
+)
 
 # %% [markdown]
 # For both classification and regression setting, we observe that
@@ -160,15 +181,23 @@ def plot_regression(model, X, y, ax=None):
 # %%
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
 plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
 _ = plt.title(f"Optimal depth found via CV: "
-              f"{tree_clf.best_params_['max_depth']}")
+              f"{tree_clf.best_params_['max_depth']}"
+              f"\n Accuracy of the fit: {accuracy:.2f}")
 
 # %%
 plot_regression(tree_reg, data_reg[data_reg_columns],
                 data_reg[target_reg_column])
+r2 = tree_reg.score(data_reg[data_reg_columns],
+                    data_reg[target_reg_column])
+
 _ = plt.title(f"Optimal depth found via CV: "
-              f"{tree_reg.best_params_['max_depth']}")
+              f"{tree_reg.best_params_['max_depth']}"
+              f"\n R$^2$ of the fit: {r2:.2f}")
 
 # %% [markdown]
 # With this example, we see that there is not a single value that is optimal
@@ -225,7 +254,11 @@ def plot_regression(model, X, y, ax=None):
 tree_clf = DecisionTreeClassifier(max_depth=max_depth)
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
-_ = plt.title(f"Decision tree with max-depth of {max_depth}")
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
+_ = plt.title(f"Decision tree with max-depth of {max_depth}"
+              f"\n Accuracy of the fit: {accuracy:.2f}")
 
 # %% [markdown]
 # As expected, we see that the blue blob on the right and the red blob on the
@@ -252,7 +285,11 @@ def plot_regression(model, X, y, ax=None):
 tree_clf = DecisionTreeClassifier(max_depth=max_depth)
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
-_ = plt.title(f"Decision tree with max-depth of {max_depth}")
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
+_ = plt.title(f"Decision tree with max-depth of {max_depth}"
+              f"\n Accuracy of the fit: {accuracy:.2f}")
 
 # %%
 _, ax = plt.subplots(figsize=(11, 7))
@@ -274,15 +311,19 @@ def plot_regression(model, X, y, ax=None):
 tree_clf = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
 plot_classification(tree_clf, data_clf[data_clf_columns],
                     data_clf[target_clf_column])
+accuracy = tree_clf.score(data_clf[data_clf_columns],
+                          data_clf[target_clf_column])
+
 _ = plt.title(
-    f"Decision tree with leaf having at least {min_samples_leaf} samples")
+    f"Decision tree with leaf having at least {min_samples_leaf} samples"
+    f"\n Accuracy of the fit: {accuracy:.2f}")
 
 # %%
 _, ax = plt.subplots(figsize=(10, 7))
 _ = plot_tree(tree_clf, ax=ax, feature_names=data_clf_columns)
 
 # %% [markdown]
-# This hyperparameter allows to have leaves with a minimum number of samples
-# and no further splits will be search otherwise. Therefore, these
-# hyperparameters could be an alternative to fix the `max_depth`
-# hyperparameter.
+# This hyperparameter ensures that each leaf contains at least a minimum
+# number of samples; otherwise, no further split is searched. Similarly, the
+# other hyperparameters mentioned above can be tuned as an alternative to
+# fixing `max_depth`.
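The patch above leans on scikit-learn's `score` convention: `ClassifierMixin.score` returns the mean accuracy on the given data, while `RegressorMixin.score` returns the coefficient of determination R^2. Below is a minimal sketch verifying this behavior; the `make_blobs`/`make_regression` datasets are synthetic stand-ins for illustration only and are not part of the patch or the notebook.

import numpy as np
from sklearn.datasets import make_blobs, make_regression
from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Synthetic stand-ins for the notebook's datasets (illustration only).
X_clf, y_clf = make_blobs(n_samples=200, centers=2, random_state=0)
X_reg, y_reg = make_regression(n_samples=200, n_features=1, random_state=0)

tree_clf = DecisionTreeClassifier(max_depth=2).fit(X_clf, y_clf)
# For classifiers, `score` is the mean accuracy on the given data...
assert np.isclose(tree_clf.score(X_clf, y_clf),
                  accuracy_score(y_clf, tree_clf.predict(X_clf)))

tree_reg = DecisionTreeRegressor(max_depth=2).fit(X_reg, y_reg)
# ...while for regressors it is R^2, which is why the regression titles
# report "R$^2$ of the fit" rather than an accuracy.
assert np.isclose(tree_reg.score(X_reg, y_reg),
                  r2_score(y_reg, tree_reg.predict(X_reg)))

Because the two scores measure different quantities, the regression cells store the value under the name `r2` instead of reusing `accuracy`.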