From 4655af1c80235384393a0dfa70d4fe3d52982ef7 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Sat, 20 Jul 2019 06:16:13 -0500 Subject: [PATCH 01/31] Class Design + Ensemble --- autokaggle/__init__.py | 2 + autokaggle/ensemblers.py | 139 +++++++++ autokaggle/hparam_space/knn_hp.json | 1 + autokaggle/hparam_space/lgbm_hp.json | 1 + autokaggle/hparam_space/rf_hp.json | 1 + autokaggle/hparam_space/svm_hp.json | 1 + autokaggle/tabular_classifiers.py | 57 ++++ autokaggle/tabular_regressors.py | 34 ++ autokaggle/tabular_supervised.py | 293 +++++++----------- autokaggle/utils.py | 8 + examples/tabular_classification_binary.py | 6 +- examples/tabular_classification_multiclass.py | 4 +- examples/tabular_regression.py | 2 +- 13 files changed, 358 insertions(+), 191 deletions(-) create mode 100644 autokaggle/ensemblers.py create mode 100644 autokaggle/hparam_space/knn_hp.json create mode 100644 autokaggle/hparam_space/lgbm_hp.json create mode 100644 autokaggle/hparam_space/rf_hp.json create mode 100644 autokaggle/hparam_space/svm_hp.json create mode 100644 autokaggle/tabular_classifiers.py create mode 100644 autokaggle/tabular_regressors.py diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index e69de29..14439bc 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -0,0 +1,2 @@ +from .tabular_preprocessor import * +from .tabular_supervised import * \ No newline at end of file diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py new file mode 100644 index 0000000..9081211 --- /dev/null +++ b/autokaggle/ensemblers.py @@ -0,0 +1,139 @@ +from sklearn.base import BaseEstimator +from tabular_preprocessor import TabularPreprocessor +from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from abc import abstractmethod +import numpy as np +import os +import random +import json + +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load + + +class RankedEnsembler: + def __init__(self, ensemble_method='max_voting'): + self.ensemble_method = ensemble_method + + def fit(self, predictions, y_true): + pass + + def predict(self, predictions): + if self.ensemble_method == 'median': + return predictions.apply(np.median, axis=1).values + elif self.ensemble_method == 'mean': + return predictions.apply(np.average, axis=1).values + elif self.ensemble_method == 'max': + return predictions.apply(np.max, axis=1).values + elif self.ensemble_method == 'min': + return predictions.apply(np.min, axis=1).values + elif self.ensemble_method == 'max_voting': + return predictions.apply(mode, axis=1).values + +class EnsembleSelection: + def __init__(self, task_type, metric, mode='fast'): + self.ensemble_size = None + self.mode = mode + self.task_type = task_type + self.metric = metric + + def fit(self, predictions, y_true): + if self.mode == 'fast': + self.fast(predictions, y_true) + else: + self.slow(predictions, y_true) + self._calculate_weights() + + def predict(self, predictions): + return np.average(predictions, axis=0, weights=self.weights_) + + def fast(self, predictions, labels): + """Fast version of Rich Caruana's ensemble selection method.""" + self.ensemble_size = len(predictions) + + ensemble = [] + trajectory = [] + order = [] + + for i in range(self.ensemble_size): + scores = np.zeros((len(predictions))) + s = len(ensemble) + if s == 0: + 
weighted_ensemble_prediction = np.zeros(predictions[0].shape) + else: + # Memory-efficient averaging! + ensemble_prediction = np.zeros(ensemble[0].shape) + for pred in ensemble: + ensemble_prediction += pred + ensemble_prediction /= s + + weighted_ensemble_prediction = (s / float(s + 1)) * \ + ensemble_prediction + fant_ensemble_prediction = np.zeros(weighted_ensemble_prediction.shape) + for j, pred in enumerate(predictions): + fant_ensemble_prediction[:,:] = weighted_ensemble_prediction + \ + (1. / float(s + 1)) * pred + scores[j] = calculate_score( + solution=labels, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + metric=self.metric, + all_scoring_functions=False) + + all_best = np.argwhere(scores == np.nanmin(scores)).flatten() + best = np.random.RandomState.choice(all_best) + ensemble.append(predictions[best]) + trajectory.append(scores[best]) + order.append(best) + + # Handle special case + if len(predictions) == 1: + break + + self.indices_ = order + + def slow(self, predictions, labels): + """Rich Caruana's ensemble selection method.""" + self.ensemble_size = len(predictions) + + ensemble = [] + trajectory = [] + order = [] + + for i in range(self.ensemble_size): + scores = np.zeros([predictions.shape[0]]) + for j, pred in enumerate(predictions): + ensemble.append(pred) + ensemble_prediction = np.mean(np.array(ensemble), axis=0) + scores[j] = calculate_score( + solution=labels, + prediction=ensemble_prediction, + task_type=self.task_type, + metric=self.metric, + all_scoring_functions=False) + ensemble.pop() + best = np.nanargmin(scores) + ensemble.append(predictions[best]) + trajectory.append(scores[best]) + order.append(best) + + # Handle special case + if len(predictions) == 1: + break + + self.indices_ = np.array(order) + + def _calculate_weights(self): + ensemble_members = Counter(self.indices_).most_common() + weights = np.zeros((self.ensemble_size,), dtype=float) + for ensemble_member in ensemble_members: + weight = float(ensemble_member[1]) / self.ensemble_size + weights[ensemble_member[0]] = weight + + if np.sum(weights) < 1: + weights = weights / np.sum(weights) + + self.weights_ = weights \ No newline at end of file diff --git a/autokaggle/hparam_space/knn_hp.json b/autokaggle/hparam_space/knn_hp.json new file mode 100644 index 0000000..609ff31 --- /dev/null +++ b/autokaggle/hparam_space/knn_hp.json @@ -0,0 +1 @@ +{"n_neighbors": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], "weights": ["uniform", "distance"], "algorithm": ["ball_tree", "kd_tree", "brute"], "leaf_size": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], "metric": ["minkowski", "euclidean", "manhattan", "chebyshev"], "p": [1, 2, 3]} \ No newline at end of file diff --git a/autokaggle/hparam_space/lgbm_hp.json b/autokaggle/hparam_space/lgbm_hp.json new file mode 100644 index 0000000..b694b11 --- /dev/null +++ b/autokaggle/hparam_space/lgbm_hp.json @@ -0,0 +1 @@ +{"boosting_type": ["gbdt"], "min_split_gain": [0.1], "max_depth": [5, 8, 10], "min_child_weight": [5, 10, 30, 50, 60, 80, 100], "colsample_bytree": [0.6, 0.7], "learning_rate": [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12], "subsample": [0.8], "num_leaves": [80], "n_estimators": [50, 100, 150, 200]} \ No newline at end of file diff --git a/autokaggle/hparam_space/rf_hp.json b/autokaggle/hparam_space/rf_hp.json new file mode 100644 index 0000000..a56221a --- /dev/null +++ b/autokaggle/hparam_space/rf_hp.json @@ -0,0 +1 @@ +{"criterion": ["entropy", "gini"], "max_features": [0.1, 0.3, 0.5, 0.7, 1.0], "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10], 
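Both ensemblers above are combination strategies over per-model predictions: RankedEnsembler.predict uses DataFrame.apply, so it expects the predictions as a pandas DataFrame with one column per base model, while EnsembleSelection implements Rich Caruana's greedy forward selection but relies on helpers that are not imported in this patch (mode, Counter, calculate_score) and on an instantiated random state. A self-contained sketch of the same greedy idea, assuming plain NumPy prediction vectors and a generic score_fn where lower is better (illustrative only, not the patch's exact implementation):

import numpy as np
from collections import Counter
from sklearn.metrics import mean_squared_error

def greedy_ensemble_selection(predictions, y_true, score_fn, rounds=25):
    """Caruana-style forward selection with replacement; returns model weights."""
    n_models = len(predictions)
    ensemble_sum = np.zeros_like(predictions[0], dtype=float)
    chosen = []
    for _ in range(rounds):
        # Score the ensemble obtained by adding each candidate model once more.
        scores = [score_fn(y_true, (ensemble_sum + p) / (len(chosen) + 1))
                  for p in predictions]
        best = int(np.nanargmin(scores))
        chosen.append(best)
        ensemble_sum += predictions[best]
    counts = Counter(chosen)
    return np.array([counts.get(i, 0) / len(chosen) for i in range(n_models)])

# Illustrative use on synthetic regression predictions.
rng = np.random.RandomState(0)
y_valid = rng.rand(200)
preds = [y_valid + rng.normal(0, s, 200) for s in (0.05, 0.2, 0.5)]
weights = greedy_ensemble_selection(preds, y_valid, mean_squared_error)
blended = np.average(np.stack(preds), axis=0, weights=weights)

The final weighting mirrors _calculate_weights in the patch: each model's weight is the fraction of greedy rounds in which it was selected.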
"min_samples_split": [2, 3, 4], "min_samples_leaf": [1, 2, 3], "n_estimators": [50, 100, 150, 200]} \ No newline at end of file diff --git a/autokaggle/hparam_space/svm_hp.json b/autokaggle/hparam_space/svm_hp.json new file mode 100644 index 0000000..8642e7f --- /dev/null +++ b/autokaggle/hparam_space/svm_hp.json @@ -0,0 +1 @@ +{"C": [0.001, 0.1, 1, 10, 100, 1000, 10000], "gamma": [1e-05, 100000.0], "kernel": ["rbf", "poly", "linear", "sigmoid"], "degree": [2, 3, 4, 5], "max_iter": [50000]} \ No newline at end of file diff --git a/autokaggle/tabular_classifiers.py b/autokaggle/tabular_classifiers.py new file mode 100644 index 0000000..e832e93 --- /dev/null +++ b/autokaggle/tabular_classifiers.py @@ -0,0 +1,57 @@ +from sklearn.base import BaseEstimator +from tabular_preprocessor import TabularPreprocessor +from tabular_supervised import AutoKaggle, TabularEstimator, LGBMMixIn, SklearnMixIn +from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from abc import abstractmethod +import numpy as np +import os +import random +import json + +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load + +class TabularClassifier(TabularEstimator): + """TabularClassifier class. + It is used for tabular data classification with lightgbm classifier. + """ + def __init__(self, path=None, verbose=True, time_limit=None): + super().__init__(path, verbose, time_limit) + self.objective = 'classification' + + def get_skf(self, folds): + if self.objective == 'binary': + score_metric = 'roc_auc' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + else: + score_metric = 'f1_weighted' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + return score_metric, skf + +class SVMClassifier(TabularClassifier, SklearnMixIn): + _default_hyperparams = "svm_hp.json" + + def init_model(self, y): + n_classes = len(set(y)) + self.objective = 'binary' if n_classes == 2 else 'multiclass' + self.estimator = LinearSVC() + +class LgbmClassifier(TabularClassifier, LGBMMixIn): + def init_model(self, y): + n_classes = len(set(y)) + if n_classes == 2: + self.objective = 'binary' + self.estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) + else: + self.objective = 'multiclass' + self.estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + num_class=n_classes, + objective=self.objective) \ No newline at end of file diff --git a/autokaggle/tabular_regressors.py b/autokaggle/tabular_regressors.py new file mode 100644 index 0000000..1b6d358 --- /dev/null +++ b/autokaggle/tabular_regressors.py @@ -0,0 +1,34 @@ +from sklearn.base import BaseEstimator +from tabular_preprocessor import TabularPreprocessor +from tabular_supervised import AutoKaggle, TabularEstimator, LGBMMixIn, SklearnMixIn +from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from abc import abstractmethod +import numpy as np +import os +import random +import json + +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load + + +class TabularRegressor(TabularEstimator): + """TabularRegressor class. 
+ It is used for tabular data regression with lightgbm regressor. + """ + def __init__(self, path=None, verbose=True, time_limit=None): + super().__init__(path, verbose, time_limit) + self.objective = 'regression' + + def get_skf(self, folds): + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) + +class LgbmRegressor(TabularRegressor, LGBMMixIn): + def init_model(self, y): + self.estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) \ No newline at end of file diff --git a/autokaggle/tabular_supervised.py b/autokaggle/tabular_supervised.py index 3f74390..64353eb 100644 --- a/autokaggle/tabular_supervised.py +++ b/autokaggle/tabular_supervised.py @@ -1,72 +1,39 @@ +from sklearn.base import BaseEstimator +from tabular_preprocessor import TabularPreprocessor +from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from abc import abstractmethod - +import numpy as np import os +import random +import json + from lightgbm import LGBMClassifier, LGBMRegressor from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error -import numpy as np -import random - -from autokaggle.tabular_preprocessor import TabularPreprocessor -from autokaggle.utils import rand_temp_folder_generator, ensure_dir +from joblib import dump, load -class TabularSupervised: - def __init__(self, path=None, verbose=True): +class AutoKagggle(BaseEstimator): + def __init__(self, estimator_class=LgbmClassifier, path=None, verbose=True): """ Initialization function for tabular supervised learner. """ self.verbose = verbose self.is_trained = False - self.clf = None self.objective = None self.tabular_preprocessor = None + self.model = None + self.estimator_class = estimator_class self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) if self.verbose: print('Path:', path) - self.save_filename = os.path.join(self.path, 'lgbm.txt') self.time_limit = None - self.lgbm = None - - def search(self, search_space, search_iter, n_estimators, x, y): - if 'n_estimators' in search_space: - del search_space['n_estimators'] - params = { - 'boosting_type': ['gbdt'], - 'min_child_weight': [5], - 'min_split_gain': [1.0], - 'subsample': [0.8], - 'colsample_bytree': [0.6], - 'max_depth': [10], - 'n_estimators': n_estimators, - 'num_leaves': [70], - 'learning_rate': [0.04], - } - params.update(search_space) - if self.verbose: - print(params) - folds = 3 - score_metric, skf = self.get_skf(folds) - - random_search = RandomizedSearchCV(self.lgbm, param_distributions=params, n_iter=search_iter, - scoring=score_metric, - n_jobs=1, cv=skf, verbose=0, random_state=1001) - - random_search.fit(x, y) - self.clf = random_search.best_estimator_ - - return random_search.best_params_ - - @abstractmethod - def get_skf(self, folds): - pass def fit(self, x, y, time_limit=None, data_info=None): """ This function should train the model parameters. - Args: x: A numpy.ndarray instance containing the training data. y: training label vector. 
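The *_hp.json files introduced above (knn_hp, lgbm_hp, rf_hp, svm_hp) are plain dictionaries mapping estimator parameter names to candidate values. A minimal sketch of how such a search space can be fed to scikit-learn's RandomizedSearchCV, which is the same mechanism the estimator classes use; the file path is an assumption and a random forest is used purely for illustration, with the single-dictionary format from this patch:

import json
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Hypothetical path; the package keeps these files under autokaggle/hparam_space/.
with open("autokaggle/hparam_space/rf_hp.json") as f:
    search_space = json.load(f)

x, y = make_classification(n_samples=300, n_features=20, random_state=0)
search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=search_space,
    n_iter=10,
    scoring="f1_weighted",
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1001),
    random_state=1001,
)
search.fit(x, y)
print(search.best_params_, search.best_score_)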
@@ -84,173 +51,129 @@ def fit(self, x, y, time_limit=None, data_info=None): if time_limit is None: time_limit = 24 * 60 * 60 self.time_limit = time_limit - - self.init_lgbm(y) - - self.tabular_preprocessor = TabularPreprocessor() - + if x.shape[1] == 0: raise ValueError("No feature exist!") - x = self.tabular_preprocessor.fit(x, y, self.time_limit, data_info) - - if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], 0.1) - else: - grid_train_percentage = 1 - grid_n = int(x.shape[0] * grid_train_percentage) - idx = random.sample(list(range(x.shape[0])), grid_n) - - grid_train_x = x[idx, :] - grid_train_y = y[idx] - while x.shape[0] < 60: x = np.concatenate([x, x], axis=0) y = np.concatenate([y, y], axis=0) - - response_rate = sum(y) / len(y) - - if not self.is_trained: - # Two-step cross-validation for hyperparameter selection - if self.verbose: - print('-----------------Search Regularization Params---------------------') - if response_rate < 0.005: - depth_choice = [5] - else: - depth_choice = [8, 10] - - params = { - 'min_split_gain': [0.1], - 'max_depth': depth_choice, - 'min_child_weight': [5, 10, 30, 50, 60, 80, 100], - 'colsample_bytree': [0.6, 0.7], - 'learning_rate': [0.3], - 'subsample': [0.8], - 'num_leaves': [80], - } - - search_iter = 14 - n_estimators_choice = [50] - best_param = self.search( - params, - search_iter, - n_estimators_choice, - grid_train_x, grid_train_y) - - if self.verbose: - print('-----------------Search Learning Rate---------------------') - for key, value in best_param.items(): - best_param[key] = [value] - best_param['learning_rate'] = [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12] - n_estimators_choice = [100, 150, 200] - search_iter = 16 - - self.search( - best_param, - search_iter, - n_estimators_choice, - grid_train_x, grid_train_y) - - if self.verbose: - print('self.clf', self.clf) - self.is_trained = True - - # Fit Model - self.clf.fit(x, y) - - self.clf.booster_.save_model(self.save_filename) + + # Init model and preprocessor + self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) + self.tabular_preprocessor = TabularPreprocessor() + + # Fit Model and preprocessor + x = self.tabular_preprocessor.fit(x, y, self.time_limit, data_info) + self.model.fit(x, y) + self.model.save_model() + self.is_trained = True if self.verbose: print("The whole available data is: ") print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) - print('Feature Importance:') - print(self.clf.feature_importances_) - - @abstractmethod - def init_lgbm(self, y): - pass - def predict(self, x_test): """ This function should provide predictions of labels on (test) data. The function predict eventually casdn return probabilities or continuous values. """ x_test = self.tabular_preprocessor.encode(x_test) - y = self.clf.predict(x_test, ) + y = self.model.predict(x_test, ) if y is None: raise ValueError("Tabular predictor does not exist") return y - @abstractmethod - def evaluate(self, x_test, y_test): - pass - - def final_fit(self, x_train, y_train): - x_train = self.tabular_preprocessor.encode(x_train) - self.clf.fit(x_train, y_train) - - -class TabularRegressor(TabularSupervised): - """TabularRegressor class. - It is used for tabular data regression with lightgbm regressor. 
- """ - - def __init__(self, path=None): - super().__init__(path) - self.objective = 'regression' - - def evaluate(self, x_test, y_test): - y_pred = self.predict(x_test) - return mean_squared_error(y_test, y_pred) - - def init_lgbm(self, y): - self.lgbm = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - - def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - - -class TabularClassifier(TabularSupervised): - """TabularClassifier class. - It is used for tabular data classification with lightgbm classifier. - """ - - def init_lgbm(self, y): - n_classes = len(set(y)) - if n_classes == 2: - self.objective = 'binary' - self.lgbm = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - else: - self.objective = 'multiclass' - self.lgbm = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - num_class=n_classes, - objective=self.objective) - def evaluate(self, x_test, y_test): if self.verbose: - print('objective:', self.objective) + print('objective:', self.model.objective) y_pred = self.predict(x_test) results = None - if self.objective == 'binary': + if self.model.objective == 'binary': results = roc_auc_score(y_test, y_pred) - elif self.objective == 'multiclass': + elif self.model.objective == 'multiclass': results = f1_score(y_test, y_pred, average='weighted') + elif self.model.objective == 'regression': + results = mean_squared_error(y_test, y_pred) return results - def get_skf(self, folds): - if self.lgbm.objective == 'binary': - score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + def final_fit(self, x_train, y_train): + x_train = self.tabular_preprocessor.encode(x_train) + self.model.fit(x_train, y_train) + +class TabularEstimator(BaseEstimator): + def __init__(self, path=None, verbose=True, time_limit=None): + """ + Initialization function for tabular supervised learner. 
+ """ + self.verbose = verbose + self.path = path + self.time_limit = time_limit + self.objective = None + self.hparams = read_json(self._default_hyperparams) + self.clf = None + self.estimator = None + + def fit(self, x, y): + self.init_model(y) + self.search(x, y) + self.clf.fit(x, y) + self.save_model() + + def predict(self, x, y=None): + y = self.clf.predict(x, ) + return y + + def search(self, x, y, search_iter=40, folds=3): + # Set small sample for hyper-param search + if x.shape[0] > 600: + grid_train_percentage = max(600.0 / x.shape[0], 0.1) else: - score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) - return score_metric, skf + grid_train_percentage = 1 + grid_n = int(x.shape[0] * grid_train_percentage) + idx = random.sample(list(range(x.shape[0])), grid_n) + grid_train_x, grid_train_y = x[idx, :], y[idx] + + if self.verbose: print(self.hparams) + score_metric, skf = self.get_skf(folds) + random_search = RandomizedSearchCV(self.estimator, param_distributions=self.hparams, n_iter=search_iter, + scoring=score_metric, + n_jobs=1, cv=skf, verbose=0, random_state=1001) + + random_search.fit(grid_train_x, grid_train_y) + self.clf = random_search.best_estimator_ + + return random_search.best_params_ + + @abstractmethod + def save_model(self): + pass + + @abstractmethod + def init_model(self, y): + pass + + @abstractmethod + def get_skf(self, folds): + pass + + def __repr__(self): + return "Estimator model" +class LGBMMixIn: + _default_hyperparams = "lgbm_hp.json" + + def save_model(self): + self.clf.booster_.save_model(self.save_filename) + + def get_feature_importance(self): + if self.estimator: + print('Feature Importance:') + print(self.clf.feature_importances_) + +class SklearnMixIn: + + def save_model(self): + dump(self.clf, self.save_filename) + + def load_model(self): + self.clf = load(self.save_filename) \ No newline at end of file diff --git a/autokaggle/utils.py b/autokaggle/utils.py index 62b833f..cfb3deb 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -25,3 +25,11 @@ def rand_temp_folder_generator(): path = sys_temp + '_' + random_suffix ensure_dir(path) return path + +def write_json(data, filename): + with open(filename, 'w') as outfile: + json.dump(data, outfile) + +def read_json(filename): + with open(filename, 'rb') as infile: + return json.load(infile) diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index df472a5..cfe0b9f 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -1,5 +1,5 @@ import numpy as np -from autokaggle import TabularClassifier +from autokaggle import AutoKaggle if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,9 +16,9 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularClassifier() + clf = AutoKagggle() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) AUC = clf.evaluate(x_test, y_test) - print(AUC) + print(AUC) \ No newline at end of file diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 7515841..d62eb51 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -16,9 +16,9 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularClassifier() + clf = AutoKagggle() datainfo = np.array(['TIME'] * 
ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) F1_score = clf.evaluate(x_test, y_test) - print(F1_score) + print(F1_score) \ No newline at end of file diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 8ba95a8..7a47cc8 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -16,7 +16,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularRegressor() + clf = AutoKagggle(LgbmRegressor) datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 289805c6cf9b5fda556e9c4b5d77c450180490a1 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 25 Jul 2019 09:49:03 -0500 Subject: [PATCH 02/31] Refactor --- autokaggle/__init__.py | 4 +- .../{tabular_supervised.py => auto_ml.py} | 100 ++---------------- autokaggle/hparam_space/lgbm_hp.json | 2 +- ...abular_preprocessor.py => preprocessor.py} | 27 +++-- autokaggle/tabular_classifiers.py | 57 ---------- autokaggle/tabular_regressors.py | 34 ------ autokaggle/utils.py | 1 + examples/tabular_classification_binary.py | 6 +- examples/tabular_classification_multiclass.py | 6 +- examples/tabular_regression.py | 6 +- 10 files changed, 44 insertions(+), 199 deletions(-) rename autokaggle/{tabular_supervised.py => auto_ml.py} (52%) rename autokaggle/{tabular_preprocessor.py => preprocessor.py} (94%) delete mode 100644 autokaggle/tabular_classifiers.py delete mode 100644 autokaggle/tabular_regressors.py diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index 14439bc..3961f6d 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,2 +1,2 @@ -from .tabular_preprocessor import * -from .tabular_supervised import * \ No newline at end of file +from autokaggle.auto_ml import AutoKaggle +from autokaggle.estimators import * \ No newline at end of file diff --git a/autokaggle/tabular_supervised.py b/autokaggle/auto_ml.py similarity index 52% rename from autokaggle/tabular_supervised.py rename to autokaggle/auto_ml.py index 64353eb..e2936e5 100644 --- a/autokaggle/tabular_supervised.py +++ b/autokaggle/auto_ml.py @@ -1,20 +1,17 @@ from sklearn.base import BaseEstimator -from tabular_preprocessor import TabularPreprocessor -from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from abc import abstractmethod import numpy as np import os import random import json - -from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV -from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error from joblib import dump, load +from autokaggle.preprocessor import TabularPreprocessor +from autokaggle.estimators import * +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json -class AutoKagggle(BaseEstimator): +class AutoKaggle(BaseEstimator): def __init__(self, estimator_class=LgbmClassifier, path=None, verbose=True): """ Initialization function for tabular supervised learner. 
@@ -22,7 +19,7 @@ def __init__(self, estimator_class=LgbmClassifier, path=None, verbose=True): self.verbose = verbose self.is_trained = False self.objective = None - self.tabular_preprocessor = None + self.preprocessor = None self.model = None self.estimator_class = estimator_class self.path = path if path is not None else rand_temp_folder_generator() @@ -61,10 +58,10 @@ def fit(self, x, y, time_limit=None, data_info=None): # Init model and preprocessor self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) - self.tabular_preprocessor = TabularPreprocessor() + self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor - x = self.tabular_preprocessor.fit(x, y, self.time_limit, data_info) + x = self.preprocessor.fit(x, y, self.time_limit, data_info) self.model.fit(x, y) self.model.save_model() self.is_trained = True @@ -78,7 +75,7 @@ def predict(self, x_test): This function should provide predictions of labels on (test) data. The function predict eventually casdn return probabilities or continuous values. """ - x_test = self.tabular_preprocessor.encode(x_test) + x_test = self.preprocessor.encode(x_test) y = self.model.predict(x_test, ) if y is None: raise ValueError("Tabular predictor does not exist") @@ -98,82 +95,5 @@ def evaluate(self, x_test, y_test): return results def final_fit(self, x_train, y_train): - x_train = self.tabular_preprocessor.encode(x_train) - self.model.fit(x_train, y_train) - -class TabularEstimator(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None): - """ - Initialization function for tabular supervised learner. - """ - self.verbose = verbose - self.path = path - self.time_limit = time_limit - self.objective = None - self.hparams = read_json(self._default_hyperparams) - self.clf = None - self.estimator = None - - def fit(self, x, y): - self.init_model(y) - self.search(x, y) - self.clf.fit(x, y) - self.save_model() - - def predict(self, x, y=None): - y = self.clf.predict(x, ) - return y - - def search(self, x, y, search_iter=40, folds=3): - # Set small sample for hyper-param search - if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], 0.1) - else: - grid_train_percentage = 1 - grid_n = int(x.shape[0] * grid_train_percentage) - idx = random.sample(list(range(x.shape[0])), grid_n) - grid_train_x, grid_train_y = x[idx, :], y[idx] - - if self.verbose: print(self.hparams) - score_metric, skf = self.get_skf(folds) - random_search = RandomizedSearchCV(self.estimator, param_distributions=self.hparams, n_iter=search_iter, - scoring=score_metric, - n_jobs=1, cv=skf, verbose=0, random_state=1001) - - random_search.fit(grid_train_x, grid_train_y) - self.clf = random_search.best_estimator_ - - return random_search.best_params_ - - @abstractmethod - def save_model(self): - pass - - @abstractmethod - def init_model(self, y): - pass - - @abstractmethod - def get_skf(self, folds): - pass - - def __repr__(self): - return "Estimator model" -class LGBMMixIn: - _default_hyperparams = "lgbm_hp.json" - - def save_model(self): - self.clf.booster_.save_model(self.save_filename) - - def get_feature_importance(self): - if self.estimator: - print('Feature Importance:') - print(self.clf.feature_importances_) - -class SklearnMixIn: - - def save_model(self): - dump(self.clf, self.save_filename) - - def load_model(self): - self.clf = load(self.save_filename) \ No newline at end of file + x_train = self.preprocessor.encode(x_train) + self.model.fit(x_train, y_train) \ No newline at end of file diff 
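After this refactor, AutoKaggle is the single entry point: it fits a TabularPreprocessor and then delegates model selection to whichever estimator class is injected. A condensed usage sketch on synthetic data, mirroring the example scripts and assuming the estimators module that a later patch in this series adds:

import numpy as np
from autokaggle import AutoKaggle
from autokaggle.estimators import LgbmClassifier, LgbmRegressor

ntime, nnum, ncat = 4, 10, 8
nsample = 1000
x = np.random.random([nsample, ntime + nnum + ncat])
y = np.random.randint(0, 2, nsample)
data_info = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat)

clf = AutoKaggle(LgbmClassifier)            # or AutoKaggle(LgbmRegressor) for regression
clf.fit(x[:800], y[:800], time_limit=60 * 60, data_info=data_info)
print(clf.evaluate(x[800:], y[800:]))       # AUC for the binary objective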
--git a/autokaggle/hparam_space/lgbm_hp.json b/autokaggle/hparam_space/lgbm_hp.json index b694b11..b2f6311 100644 --- a/autokaggle/hparam_space/lgbm_hp.json +++ b/autokaggle/hparam_space/lgbm_hp.json @@ -1 +1 @@ -{"boosting_type": ["gbdt"], "min_split_gain": [0.1], "max_depth": [5, 8, 10], "min_child_weight": [5, 10, 30, 50, 60, 80, 100], "colsample_bytree": [0.6, 0.7], "learning_rate": [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12], "subsample": [0.8], "num_leaves": [80], "n_estimators": [50, 100, 150, 200]} \ No newline at end of file +[{"boosting_type": ["gbdt"], "min_child_weight": [5, 10, 30, 50, 60, 80, 100], "min_split_gain": [0.1], "subsample": [0.8], "colsample_bytree": [0.6, 0.7], "max_depth": [5, 8, 10], "n_estimators": [50], "num_leaves": [80], "learning_rate": [0.3]}, {"learning_rate": [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12], "n_estimators": [100, 150, 200]}] \ No newline at end of file diff --git a/autokaggle/tabular_preprocessor.py b/autokaggle/preprocessor.py similarity index 94% rename from autokaggle/tabular_preprocessor.py rename to autokaggle/preprocessor.py index 1bf6d76..543e99a 100644 --- a/autokaggle/tabular_preprocessor.py +++ b/autokaggle/preprocessor.py @@ -100,7 +100,7 @@ def __init__(self): self.feature_add_cat_cat = 0 self.order_num_cat_pair = {} - self.rest = None + self.selected_cols = None self.budget = None self.data_info = None self.n_time = None @@ -108,10 +108,16 @@ def __init__(self): self.n_cat = None def remove_useless(self, x): - self.rest = np.where(np.max(x, 0) - np.min(x, 0) != 0)[0] - return x[:, self.rest] + """ + Remove the columns whose max_value == min_value + """ + self.selected_cols = np.where(np.max(x, axis=0) - np.min(x, axis=0) != 0)[0] + return x[:, self.selected_cols] def process_time(self, x): + """ + Process the TIME features. Add the difference between consecutive columns as a feature. 
+ """ cols = range(self.n_time) if len(cols) > 10: cols = cols[:10] @@ -122,6 +128,9 @@ def process_time(self, x): return x def extract_data(self, raw_x): + """ + Encodes the catgorical data and returns all numeric values + """ # only get numerical variables ret = np.concatenate([raw_x['TIME'], raw_x['NUM'], raw_x['CAT']], axis=1) n_rows = ret.shape[0] @@ -242,7 +251,7 @@ def fit(self, raw_x, y, time_limit, data_info): # Get Meta-Feature self.budget = time_limit self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - print('QQ: {}'.format(self.data_info)) + print('DATA_INFO: {}'.format(self.data_info)) self.n_time = sum(self.data_info == 'TIME') self.n_num = sum(self.data_info == 'NUM') @@ -250,9 +259,9 @@ def fit(self, raw_x, y, time_limit, data_info): self.total_samples = raw_x.shape[0] - print('QQ1: {}'.format(self.n_time)) - print('QQ2: {}'.format(self.n_num)) - print('QQ3: {}'.format(self.n_cat)) + print('#TIME features: {}'.format(self.n_time)) + print('#NUM features: {}'.format(self.n_num)) + print('#CAT features: {}'.format(self.n_cat)) raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], 'NUM': raw_x[:, self.data_info == 'NUM'], 'CAT': raw_x[:, self.data_info == 'CAT']} @@ -321,8 +330,8 @@ def encode(self, raw_x, time_limit=None): x = self.cat_to_num(x) x = self.process_time(x) - if self.rest is not None: - x = x[:, self.rest] + if self.selected_cols is not None: + x = x[:, self.selected_cols] return x @staticmethod diff --git a/autokaggle/tabular_classifiers.py b/autokaggle/tabular_classifiers.py deleted file mode 100644 index e832e93..0000000 --- a/autokaggle/tabular_classifiers.py +++ /dev/null @@ -1,57 +0,0 @@ -from sklearn.base import BaseEstimator -from tabular_preprocessor import TabularPreprocessor -from tabular_supervised import AutoKaggle, TabularEstimator, LGBMMixIn, SklearnMixIn -from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json -from abc import abstractmethod -import numpy as np -import os -import random -import json - -from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV -from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error -from joblib import dump, load - -class TabularClassifier(TabularEstimator): - """TabularClassifier class. - It is used for tabular data classification with lightgbm classifier. 
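remove_useless drops the columns whose values never vary (max equals min) and stores the surviving indices in selected_cols so that encode can re-apply exactly the same mask at prediction time. The mask in isolation, on a small array:

import numpy as np

x = np.array([[1., 5., 0.],
              [2., 5., 0.],
              [3., 5., 0.]])
selected_cols = np.where(np.max(x, axis=0) - np.min(x, axis=0) != 0)[0]
print(selected_cols)        # [0] -- only the first column varies, the constant ones are dropped
print(x[:, selected_cols])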
- """ - def __init__(self, path=None, verbose=True, time_limit=None): - super().__init__(path, verbose, time_limit) - self.objective = 'classification' - - def get_skf(self, folds): - if self.objective == 'binary': - score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) - else: - score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) - return score_metric, skf - -class SVMClassifier(TabularClassifier, SklearnMixIn): - _default_hyperparams = "svm_hp.json" - - def init_model(self, y): - n_classes = len(set(y)) - self.objective = 'binary' if n_classes == 2 else 'multiclass' - self.estimator = LinearSVC() - -class LgbmClassifier(TabularClassifier, LGBMMixIn): - def init_model(self, y): - n_classes = len(set(y)) - if n_classes == 2: - self.objective = 'binary' - self.estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - else: - self.objective = 'multiclass' - self.estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - num_class=n_classes, - objective=self.objective) \ No newline at end of file diff --git a/autokaggle/tabular_regressors.py b/autokaggle/tabular_regressors.py deleted file mode 100644 index 1b6d358..0000000 --- a/autokaggle/tabular_regressors.py +++ /dev/null @@ -1,34 +0,0 @@ -from sklearn.base import BaseEstimator -from tabular_preprocessor import TabularPreprocessor -from tabular_supervised import AutoKaggle, TabularEstimator, LGBMMixIn, SklearnMixIn -from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json -from abc import abstractmethod -import numpy as np -import os -import random -import json - -from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV -from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error -from joblib import dump, load - - -class TabularRegressor(TabularEstimator): - """TabularRegressor class. - It is used for tabular data regression with lightgbm regressor. 
- """ - def __init__(self, path=None, verbose=True, time_limit=None): - super().__init__(path, verbose, time_limit) - self.objective = 'regression' - - def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - -class LgbmRegressor(TabularRegressor, LGBMMixIn): - def init_model(self, y): - self.estimator = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) \ No newline at end of file diff --git a/autokaggle/utils.py b/autokaggle/utils.py index cfb3deb..31479c6 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -2,6 +2,7 @@ import tempfile import string import random +import json def ensure_dir(directory): diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index cfe0b9f..b4fda77 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -1,5 +1,7 @@ import numpy as np -from autokaggle import AutoKaggle +import sys +sys.path.append("../autokaggle/") +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKagggle() + clf = AutoKaggle() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index d62eb51..19284cd 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -1,5 +1,7 @@ import numpy as np -from autokaggle import TabularClassifier +import sys +sys.path.append("../autokaggle/") +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKagggle() + clf = AutoKaggle() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 7a47cc8..8c1ed50 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -1,5 +1,7 @@ import numpy as np -from autokaggle import TabularRegressor +import sys +sys.path.append("../autokaggle/") +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKagggle(LgbmRegressor) + clf = AutoKaggle(LgbmRegressor) datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 23ddedf46c299294dcab5dd7832b01f80bbed1d0 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 25 Jul 2019 13:23:54 -0500 Subject: [PATCH 03/31] Add Benchmarking script --- autokaggle/utils.py | 14 +++- examples/benchmarking.py | 166 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 4 deletions(-) create mode 100644 examples/benchmarking.py diff --git a/autokaggle/utils.py b/autokaggle/utils.py index 31479c6..e93f7d8 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -5,6 +5,10 @@ import json +def generate_rand_string(size): + chars = string.ascii_uppercase + string.digits + return ''.join(random.choice(chars) for _ in range(size)) + def ensure_dir(directory): """Create 
directory if it does not exist.""" if not os.path.exists(directory): @@ -19,11 +23,8 @@ def temp_path_generator(): def rand_temp_folder_generator(): """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras).""" - chars = string.ascii_uppercase + string.digits - size = 6 - random_suffix = ''.join(random.choice(chars) for _ in range(size)) sys_temp = temp_path_generator() - path = sys_temp + '_' + random_suffix + path = sys_temp + '_' + generate_rand_string(6) ensure_dir(path) return path @@ -34,3 +35,8 @@ def write_json(data, filename): def read_json(filename): with open(filename, 'rb') as infile: return json.load(infile) + +def write_csv(filename, line): + with open(filename, "a") as f: + f.write(", ".join(map(str, line))) + f.write("\n") diff --git a/examples/benchmarking.py b/examples/benchmarking.py new file mode 100644 index 0000000..860b48b --- /dev/null +++ b/examples/benchmarking.py @@ -0,0 +1,166 @@ +import numpy as np +import pandas as pd +import sklearn.model_selection +import sklearn.datasets +from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, balanced_accuracy_score,\ +mean_absolute_error, mean_squared_error +from autosklearn.regression import AutoSklearnRegressor +from autosklearn.classification import AutoSklearnClassifier +from autokaggle import * +from autokaggle.utils import * +import openml +openml.config.apikey = '3c7196c92a274c3b9405a7e26e9f848e' +import warnings +from abc import abstractmethod + +def generate_rand_string(size): + chars = string.ascii_uppercase + string.digits + return ''.join(random.choice(chars) for _ in range(size)) + +class BenchmarkingBase: + results = None + cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", "balanced_accuracy", "F1_score", "AUC"] + rgs_desc = ["automl_model", "task_id", "time_limit", "MSE", "MAE", "R2_score"] + + def __init__(self, supress_warnings=True, sess_name=""): + if supress_warnings: + warnings.filterwarnings('ignore') + self.results = [] + if not sess_name: + sess_name = generate_rand_string(6) + self.cls_results = pd.DataFrame(columns=self.cls_desc) + self.rgs_results = pd.DataFrame(columns=self.rgs_desc) + + def measure_performance_cls(self, y_true, y_pred, binary=False): + accuracy = accuracy_score(y_true, y_pred) + ber = balanced_accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score(y_true, y_pred, average="weighted") + auc = roc_auc_score(y_true, y_pred) if binary else "-" + return [accuracy, ber, f1, auc] + + def measure_performance_rgs(self, y_true, y_pred): + mse = mean_squared_error(y_true, y_pred) + mae = mean_absolute_error(y_true, y_pred) + r2 = r2_score(y_true, y_pred) + return [mse, mae, r2] + + def export_results(self): + self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) + self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", index=False) + + @abstractmethod + def evaluate(self, task, time_limit): + pass + + def run_automation(self, task_list, time_limit=10*60): + for task in task_list: + try: + self.evaluate(task, time_limit=time_limit) + except: + print("task: {} didnt work".format(task)) + + def time_lapse(self, task_id, time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): + tl_results = [] + for time_limit in time_limits: + tl_results.append(self.evaluate(task_id, time_limit=time_limit)) + return tl_results + + +class BenchmarkingAutoKaggle(BenchmarkingBase): + def get_data_info(self, dataset, 
num_cols): + nominal_feat = dataset.get_features_by_type('nominal') + numerical_feat = dataset.get_features_by_type('numeric') + string_feat = dataset.get_features_by_type('string') + date_feat = dataset.get_features_by_type('date') + + data_info = [] + for i in range(num_cols): + if i in date_feat: + data_info.append("TIM") + elif i in numerical_feat: + data_info.append("NUM") + else: + data_info.append("CAT") + return np.array(data_info) + + def evaluate(self, task_id, time_limit=10*60): + task_info = ["autokaggle", task_id, time_limit] + task = openml.tasks.get_task(task_id) + train_indices, test_indices = task.get_train_test_split_indices() + dataset = task.get_dataset() + X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + + x_train, y_train = X[train_indices], y[train_indices] + x_test, y_test = X[test_indices], y[test_indices] + + # Create feature type list from openml.org indicator + data_info = self.get_data_info(dataset, len(attribute_names)) + + # Train + if task.task_type == 'Supervised Classification': + automl = AutoKaggle() + elif task.task_type == 'Supervised Regression': + automl = AutoKaggle(LgbmRegressor) + else: + print("UNSUPPORTED TASK_TYPE") + assert(0) + + automl.fit(x_train, y_train, time_limit=time_limit, data_info=data_info) + + # Evaluate + y_hat = automl.predict(x_test) + + if task.task_type == 'Supervised Classification': + is_binary = True if len(task.class_labels) <= 2 else False + result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) + self.cls_results.loc[len(self.cls_results)] = result + elif task.task_type == 'Supervised Regression': + result = task_info + self.measure_performance_rgs(y_test, y_hat) + self.rgs_results.loc[len(sel.rgs_results)] = result + print(result) + return result + + +class BenchmarkingAutoSklearn(BenchmarkingBase): + def get_data_info(self, categorical_indicator): + return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] + + def evaluate(self, task_id, time_limit=10*60): + task_info = ["autosklearn", task_id, time_limit] + task = openml.tasks.get_task(task_id) + train_indices, test_indices = task.get_train_test_split_indices() + dataset = task.get_dataset() + X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + + x_train, y_train = X[train_indices], y[train_indices] + x_test, y_test = X[test_indices], y[test_indices] + + # Create feature type list from openml.org indicator + feat_type = self.get_data_info(categorical_indicator) + + # Train + if task.task_type == 'Supervised Classification': + automl = AutoSklearnClassifier( + time_left_for_this_task=time_limit, + per_run_time_limit=time_limit//10, **kwargs) + elif task.task_type == 'Supervised Regression': + automl = AutoSklearnRegressor( + time_left_for_this_task=time_limit, + per_run_time_limit=time_limit//10, **kwargs) + else: + print("UNSUPPORTED TASK_TYPE") + assert(0) + + automl.fit(x_train, y_train, feat_type=feat_type) + + y_hat = automl.predict(x_test) + if task.task_type == 'Supervised Classification': + is_binary = True if len(task.class_labels) <= 2 else False + result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) + self.cls_results.loc[len(self.cls_results)] = result + elif task.task_type == 'Supervised Regression': + result = task_info + self.measure_performance_rgs(y_test, y_hat) + self.rgs_results.loc[len(self.rgs_results)] = result + self.results.append(result) + 
print(result) + return result \ No newline at end of file From fadf0ab3172e756136f3fde828d8cfa4d73b780d Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 25 Jul 2019 14:39:30 -0500 Subject: [PATCH 04/31] Add estimators --- autokaggle/ensemblers.py | 66 ++--------- autokaggle/estimators.py | 182 +++++++++++++++++++++++++++++ autokaggle/hparam_space/rf_hp.json | 2 +- autokaggle/preprocessor.py | 33 +++++- examples/benchmarking.py | 122 +++++++++++++++++-- 5 files changed, 338 insertions(+), 67 deletions(-) create mode 100644 autokaggle/estimators.py diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 9081211..532072e 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -1,13 +1,13 @@ from sklearn.base import BaseEstimator -from tabular_preprocessor import TabularPreprocessor -from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from autokaggle.preprocessor import TabularPreprocessor +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from abc import abstractmethod import numpy as np import os import random import json +from statistics import mode -from lightgbm import LGBMClassifier, LGBMRegressor from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error @@ -34,26 +34,11 @@ def predict(self, predictions): return predictions.apply(mode, axis=1).values class EnsembleSelection: - def __init__(self, task_type, metric, mode='fast'): - self.ensemble_size = None - self.mode = mode - self.task_type = task_type - self.metric = metric + def __init__(self, ensemble_size=25): + self.ensemble_size = ensemble_size def fit(self, predictions, y_true): - if self.mode == 'fast': - self.fast(predictions, y_true) - else: - self.slow(predictions, y_true) - self._calculate_weights() - - def predict(self, predictions): - return np.average(predictions, axis=0, weights=self.weights_) - - def fast(self, predictions, labels): - """Fast version of Rich Caruana's ensemble selection method.""" - self.ensemble_size = len(predictions) - + """Rich Caruana's ensemble selection method. 
(fast version)""" ensemble = [] trajectory = [] order = [] @@ -94,39 +79,6 @@ def fast(self, predictions, labels): break self.indices_ = order - - def slow(self, predictions, labels): - """Rich Caruana's ensemble selection method.""" - self.ensemble_size = len(predictions) - - ensemble = [] - trajectory = [] - order = [] - - for i in range(self.ensemble_size): - scores = np.zeros([predictions.shape[0]]) - for j, pred in enumerate(predictions): - ensemble.append(pred) - ensemble_prediction = np.mean(np.array(ensemble), axis=0) - scores[j] = calculate_score( - solution=labels, - prediction=ensemble_prediction, - task_type=self.task_type, - metric=self.metric, - all_scoring_functions=False) - ensemble.pop() - best = np.nanargmin(scores) - ensemble.append(predictions[best]) - trajectory.append(scores[best]) - order.append(best) - - # Handle special case - if len(predictions) == 1: - break - - self.indices_ = np.array(order) - - def _calculate_weights(self): ensemble_members = Counter(self.indices_).most_common() weights = np.zeros((self.ensemble_size,), dtype=float) for ensemble_member in ensemble_members: @@ -136,4 +88,8 @@ def _calculate_weights(self): if np.sum(weights) < 1: weights = weights / np.sum(weights) - self.weights_ = weights \ No newline at end of file + self.weights_ = weights + self._calculate_weights() + + def predict(self, predictions): + return np.average(predictions, axis=1, weights=self.weights_) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py new file mode 100644 index 0000000..d9907ac --- /dev/null +++ b/autokaggle/estimators.py @@ -0,0 +1,182 @@ +from sklearn.base import BaseEstimator +from abc import abstractmethod +import numpy as np +import os +import random +import json + +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.svm import SVC +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load + +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json + +class TabularEstimator(BaseEstimator): + def __init__(self, path=None, verbose=True, time_limit=None): + """ + Initialization function for tabular supervised learner. 
+ """ + self.verbose = verbose + self.path = path + self.time_limit = time_limit + self.objective = None + abs_cwd = os.path.split(os.path.abspath(__file__))[0] + self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) + self.clf = None + self.estimator = None + + def fit(self, x, y): + self.init_model(y) + self.search(x, y) + self.clf.fit(x, y) + self.save_model() + + def predict(self, x, y=None): + y = self.clf.predict(x, ) + return y + + def search(self, x, y, search_iter=40, folds=3): + # Set small sample for hyper-param search + if x.shape[0] > 600: + grid_train_percentage = max(600.0 / x.shape[0], 0.1) + else: + grid_train_percentage = 1 + grid_n = int(x.shape[0] * grid_train_percentage) + idx = random.sample(list(range(x.shape[0])), grid_n) + grid_train_x, grid_train_y = x[idx, :], y[idx] + + if type(self.hparams) != list: + self.hparams = [self.hparams] + + best_params = {} + for idx, search_space in enumerate(self.hparams): + best_params.update(search_space) + if self.verbose: + print("Step: {}".format(idx+1)) + print("Search space:") + print(best_params) + score_metric, skf = self.get_skf(folds) + random_search = RandomizedSearchCV(self.estimator, param_distributions=best_params, n_iter=search_iter, + scoring=score_metric, + n_jobs=1, cv=skf, verbose=0, random_state=1001, iid=False) + random_search.fit(grid_train_x, grid_train_y) + best_params = random_search.best_params_ + for key, value in best_params.items(): + best_params[key] = [value] + + self.clf = random_search.best_estimator_ + + return random_search.best_params_ + + @abstractmethod + def save_model(self): + pass + + @abstractmethod + def init_model(self, y): + pass + + @abstractmethod + def get_skf(self, folds): + pass + + def __repr__(self): + return "TabularEstimator model" + + +class Classifier(TabularEstimator): + """Classifier class. + It is used for tabular data classification with lightgbm classifier. + """ + def __init__(self, path=None, verbose=True, time_limit=None): + super().__init__(path, verbose, time_limit) + self.objective = 'classification' + + def get_skf(self, folds): + if self.objective == 'binary': + score_metric = 'roc_auc' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + else: + score_metric = 'f1_weighted' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + return score_metric, skf + + +class Regressor(TabularEstimator): + """Regressor class. + It is used for tabular data regression with lightgbm regressor. 
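The search routine above treats self.hparams as a list of stages: each stage is explored with RandomizedSearchCV, and the best values found so far are frozen (wrapped back into one-element lists) before the next stage runs, which is how the two-part lgbm_hp.json and three-part rf_hp.json files are meant to be read. A stripped-down sketch of that control flow, with an ad-hoc three-stage space and a random forest standing in for the real estimators:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

def staged_random_search(estimator, stages, x, y, scoring="f1_weighted", n_iter=4):
    """Run one RandomizedSearchCV per stage, carrying the best params forward."""
    frozen, best_model = {}, None
    for stage in stages:
        space = {**frozen, **stage}         # frozen winners plus this stage's candidates
        search = RandomizedSearchCV(
            estimator, param_distributions=space, n_iter=n_iter,
            scoring=scoring,
            cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1001),
            random_state=1001)
        search.fit(x, y)
        best_model = search.best_estimator_
        frozen = {k: [v] for k, v in search.best_params_.items()}
    return best_model, frozen

x, y = make_classification(n_samples=300, random_state=0)
stages = [{"criterion": ["entropy", "gini"], "max_features": [0.3, 0.6, 1.0]},
          {"min_samples_leaf": [1, 2, 3, 5, 8]},
          {"n_estimators": [50, 100, 150, 200]}]
model, best_params = staged_random_search(RandomForestClassifier(), stages, x, y)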
+ """ + def __init__(self, path=None, verbose=True, time_limit=None): + super().__init__(path, verbose, time_limit) + self.objective = 'regression' + + def get_skf(self, folds): + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) + + +class LGBMMixIn: + _default_hyperparams = "lgbm_hp.json" + + def save_model(self): + self.clf.booster_.save_model(self.save_filename) + + def get_feature_importance(self): + if self.estimator: + print('Feature Importance:') + print(self.clf.feature_importances_) + + +class SklearnMixIn: + + def save_model(self): + dump(self.clf, self.save_filename) + + def load_model(self): + self.clf = load(self.save_filename) + + +class SVMClassifier(Classifier, SklearnMixIn): + _default_hyperparams = "svm_hp.json" + + def init_model(self, y): + n_classes = len(set(y)) + self.objective = 'binary' if n_classes == 2 else 'multiclass' + self.estimator = SVC() + + +class RFClassifier(Classifier, SklearnMixIn): + _default_hyperparams = "rf_hp.json" + + def init_model(self, y): + n_classes = len(set(y)) + self.objective = 'binary' if n_classes == 2 else 'multiclass' + self.estimator = RandomForestClassifier() + +class LgbmClassifier(Classifier, LGBMMixIn): + def init_model(self, y): + n_classes = len(set(y)) + if n_classes == 2: + self.objective = 'binary' + self.estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) + else: + self.objective = 'multiclass' + self.estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + num_class=n_classes, + objective=self.objective) + + +class LgbmRegressor(Regressor, LGBMMixIn): + def init_model(self, y): + self.estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) \ No newline at end of file diff --git a/autokaggle/hparam_space/rf_hp.json b/autokaggle/hparam_space/rf_hp.json index a56221a..c23c577 100644 --- a/autokaggle/hparam_space/rf_hp.json +++ b/autokaggle/hparam_space/rf_hp.json @@ -1 +1 @@ -{"criterion": ["entropy", "gini"], "max_features": [0.1, 0.3, 0.5, 0.7, 1.0], "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10], "min_samples_split": [2, 3, 4], "min_samples_leaf": [1, 2, 3], "n_estimators": [50, 100, 150, 200]} \ No newline at end of file +[{"criterion": ["entropy", "gini"], "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], "n_estimators": [300], "min_samples_leaf": [1]}, {"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {"n_estimators": [50, 100, 150, 200]}] \ No newline at end of file diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 543e99a..9509261 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -1,6 +1,9 @@ import numpy as np from pandas import DataFrame +import scipy from scipy.stats import pearsonr +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler LEVEL_HIGH = 32 @@ -96,8 +99,8 @@ def __init__(self): self.high_level_cat_keys = [] self.feature_add_high_cat = 0 - self.feature_add_cat_num = 0 - self.feature_add_cat_cat = 0 + self.feature_add_cat_num = 10 + self.feature_add_cat_cat = 10 self.order_num_cat_pair = {} self.selected_cols = None @@ -106,6 +109,8 @@ def __init__(self): self.n_time = None self.n_num = None self.n_cat = None + self.pca = None + self.scaler = None def remove_useless(self, x): """ @@ -262,6 +267,9 @@ def fit(self, raw_x, y, time_limit, data_info): print('#TIME features: {}'.format(self.n_time)) print('#NUM features: {}'.format(self.n_num)) print('#CAT features: 
{}'.format(self.n_cat)) + + # Convert sparse to dense if needed + raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], 'NUM': raw_x[:, self.data_info == 'NUM'], 'CAT': raw_x[:, self.data_info == 'CAT']} @@ -292,7 +300,17 @@ def fit(self, raw_x, y, time_limit, data_info): # Encode high-order categorical data to numerical with frequency x = self.cat_to_num(x, y) - + + # Standardize numeric columns + if self.n_num > 0: + self.scaler = StandardScaler() + x[:, :self.n_num] = self.scaler.fit_transform(x[:, :self.n_num]) + + # PCA for feature generation + self.pca = PCA(n_components=0.99, svd_solver='full') + x_pca = self.pca.fit_transform(x[:, :self.n_num]) + x = np.concatenate([x, x_pca], axis=1) + x = self.process_time(x) x = self.remove_useless(x) @@ -318,6 +336,8 @@ def encode(self, raw_x, time_limit=None): else: self.budget = time_limit + # Convert sparse to dense if needed + raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], 'NUM': raw_x[:, self.data_info == 'NUM'], 'CAT': raw_x[:, self.data_info == 'CAT']} @@ -330,6 +350,13 @@ def encode(self, raw_x, time_limit=None): x = self.cat_to_num(x) x = self.process_time(x) + + if self.scaler: + x[:, :self.n_num] = self.scaler.transform(x[:, :self.n_num]) + if self.pca: + x_pca = self.pca.transform(x[:, :self.n_num]) + x = np.concatenate([x, x_pca], axis=1) + if self.selected_cols is not None: x = x[:, self.selected_cols] return x diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 860b48b..c62b1dc 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -1,3 +1,5 @@ +import sys +sys.path.append("../") import numpy as np import pandas as pd import sklearn.model_selection @@ -26,8 +28,7 @@ def __init__(self, supress_warnings=True, sess_name=""): if supress_warnings: warnings.filterwarnings('ignore') self.results = [] - if not sess_name: - sess_name = generate_rand_string(6) + self.sess_name = generate_rand_string(6) if not sess_name else sess_name self.cls_results = pd.DataFrame(columns=self.cls_desc) self.rgs_results = pd.DataFrame(columns=self.rgs_desc) @@ -45,8 +46,10 @@ def measure_performance_rgs(self, y_true, y_pred): return [mse, mae, r2] def export_results(self): - self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) - self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", index=False) + if len(self.cls_results) > 0: + self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) + if len(self.rgs_results) > 0: + self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", index=False) @abstractmethod def evaluate(self, task, time_limit): @@ -65,8 +68,23 @@ def time_lapse(self, task_id, time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 24 tl_results.append(self.evaluate(task_id, time_limit=time_limit)) return tl_results + def get_dataset_splits(self, task_id): + task = openml.tasks.get_task(task_id) + train_indices, test_indices = task.get_train_test_split_indices() + dataset = task.get_dataset() + X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + + x_train, y_train = X[train_indices], y[train_indices] + x_test, y_test = X[test_indices], y[test_indices] + return x_train, y_train, x_test, y_test + class BenchmarkingAutoKaggle(BenchmarkingBase): + estimator_type = None + + def set_estimator_type(self, est_type): 
+ self.estimator_type = est_type + def get_data_info(self, dataset, num_cols): nominal_feat = dataset.get_features_by_type('nominal') numerical_feat = dataset.get_features_by_type('numeric') @@ -98,9 +116,9 @@ def evaluate(self, task_id, time_limit=10*60): # Train if task.task_type == 'Supervised Classification': - automl = AutoKaggle() + automl = AutoKaggle() if not self.estimator_type else AutoKaggle(self.estimator_type) elif task.task_type == 'Supervised Regression': - automl = AutoKaggle(LgbmRegressor) + automl = AutoKaggle(LgbmRegressor) if not self.estimator_type else AutoKaggle(self.estimator_type) else: print("UNSUPPORTED TASK_TYPE") assert(0) @@ -116,7 +134,7 @@ def evaluate(self, task_id, time_limit=10*60): self.cls_results.loc[len(self.cls_results)] = result elif task.task_type == 'Supervised Regression': result = task_info + self.measure_performance_rgs(y_test, y_hat) - self.rgs_results.loc[len(sel.rgs_results)] = result + self.rgs_results.loc[len(self.rgs_results)] = result print(result) return result @@ -163,4 +181,92 @@ def evaluate(self, task_id, time_limit=10*60): self.rgs_results.loc[len(self.rgs_results)] = result self.results.append(result) print(result) - return result \ No newline at end of file + return result + +def get_dataset_ids(task_ids): + if type(task_ids) == list: + return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids] + else: + return openml.tasks.get_task(task_ids).dataset_id + +def get_task_info(task_ids): + task_types = [] + dataset_list = [] + for i, t_id in enumerate(task_ids): + task = openml.tasks.get_task(t_id) + dataset = openml.datasets.get_dataset(task.dataset_id) + if task.task_type_id == 1: + _, y, _, _ = dataset.get_data(target=task.target_name, dataset_format='array') + task_type = "Binary Classification" if len(set(y)) <= 2 else "Multiclass classification ({})".format(len(set(y))) + else: + task_type = "Regression" + task_types.append(task_type) + dataset_list.append(dataset) + return dataset_list, task_types + +def get_dataset_properties(task_ids): + dataset_list, task_types = get_task_info(task_ids) + df = pd.DataFrame(columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", "#Date"]) + for i, dataset in enumerate(dataset_list): + df.loc[i] = [ + dataset.name, + dataset.qualities["NumberOfInstances"], + task_types[i], + len(dataset.get_features_by_type('numeric')), + len(dataset.get_features_by_type('nominal')), + len(dataset.get_features_by_type('string')), + len(dataset.get_features_by_type('date')), + ] + return df + +def get_performance_table(filename, metric): + """ + Read the results csv and convert into the performance table based on the median of the results for each task. 
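+        Illustrative call (a sketch, not part of this patch): the filename and metric below are
+        placeholders matching the csv written by export_results and its accuracy column, e.g.
+            perf = get_performance_table("mysession_classification_results.csv", metric="accuracy")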
+ """ + test = pd.read_csv(filename) + perf = pd.DataFrame(columns=["Name", "AutoKaggle", "AutoSklearn", "H2O.ai"]) + task_ids = list(set(test["task_id"])) + dataset_ids = get_dataset_ids(task_ids) + + test = test.set_index(["task_id", "automl_model"]) + test.sort_index(inplace=True) + for i, t_id in enumerate(task_ids): + try: + name = openml.datasets.get_dataset(dataset_ids[i]).name + auto_kaggle = test.loc[(t_id, "autokaggle")][metric].median() if (t_id, "autokaggle") in test.index else np.nan + auto_sklearn = test.loc[(t_id, "autosklearn")][metric].median() if (t_id, "autosklearn") in test.index else np.nan + h2o_ai = test.loc[(t_id, "autosklearn")][metric].median() if (t_id, "autosklearn") in test.index else np.nan + perf.loc[i] = [name, auto_kaggle, auto_sklearn, h2o_ai] + except Exception as e: + print(e) + return perf + +def style_results(res): + """ + Highlight the max results and set index to name + """ + def highlight_max(s): + ''' + highlight the maximum in a Series yellow. + ''' + is_max = s == s.max() + return ['background-color: yellow' if v else '' for v in is_max] + res = res.set_index("Name") + res.style.apply(highlight_max, axis=1) + return res + +import statistics +def get_box_plot(data, task_id, metric): + """ + Plots the boxplot of variance + """ + auto_sklearn = list(data.loc[(task_id, "autosklearn")][metric]) + auto_kaggle = list(data.loc[(task_id, "autokaggle")][metric]) + med_sk = statistics.median(auto_sklearn) + med_ak = statistics.median(auto_kaggle) + while len(auto_sklearn) < len(auto_kaggle): + auto_sklearn.append(med_sk) + while len(auto_sklearn) > len(auto_kaggle): + auto_kaggle.append(med_ak) + temp = pd.DataFrame(data={"Autokaggle":auto_kaggle, "AutoSklearn":auto_sklearn}) + temp.boxplot() From 1f800e1902fbe6efe522b62c6572654e51d3b1a8 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 29 Aug 2019 13:54:54 -0500 Subject: [PATCH 05/31] Fixed the tests --- autokaggle/auto_ml.py | 14 +++++++------- autokaggle/estimators.py | 12 +++++++++--- examples/tabular_classification_binary.py | 3 ++- examples/tabular_classification_multiclass.py | 3 ++- examples/tabular_regression.py | 4 ++-- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index e2936e5..6302d32 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -11,17 +11,17 @@ from autokaggle.estimators import * from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json + class AutoKaggle(BaseEstimator): - def __init__(self, estimator_class=LgbmClassifier, path=None, verbose=True): + def __init__(self, estimator=LgbmClassifier(), preprocessor=TabularPreprocessor(), path=None, verbose=True): """ Initialization function for tabular supervised learner. 
""" self.verbose = verbose self.is_trained = False self.objective = None - self.preprocessor = None - self.model = None - self.estimator_class = estimator_class + self.preprocessor = preprocessor + self.model = estimator self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) if self.verbose: @@ -56,9 +56,9 @@ def fit(self, x, y, time_limit=None, data_info=None): x = np.concatenate([x, x], axis=0) y = np.concatenate([y, y], axis=0) - # Init model and preprocessor - self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) - self.preprocessor = TabularPreprocessor() + # # Init model and preprocessor + # self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) + # self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor x = self.preprocessor.fit(x, y, self.time_limit, data_info) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index d9907ac..0d4e73d 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -15,6 +15,7 @@ from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json + class TabularEstimator(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None): """ @@ -39,16 +40,21 @@ def predict(self, x, y=None): y = self.clf.predict(x, ) return y - def search(self, x, y, search_iter=40, folds=3): + @staticmethod + def subsample(x, y, sample_percent): # Set small sample for hyper-param search if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], 0.1) + grid_train_percentage = max(600.0 / x.shape[0], sample_percent) else: grid_train_percentage = 1 grid_n = int(x.shape[0] * grid_train_percentage) idx = random.sample(list(range(x.shape[0])), grid_n) grid_train_x, grid_train_y = x[idx, :], y[idx] - + return grid_train_x, grid_train_y + + def search(self, x, y, search_iter=40, folds=3): + grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) + if type(self.hparams) != list: self.hparams = [self.hparams] diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index b4fda77..0ebd91f 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -1,6 +1,7 @@ import numpy as np import sys -sys.path.append("../autokaggle/") + +sys.path.append("..") from autokaggle import * if __name__ == '__main__': diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 19284cd..13612f6 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -1,6 +1,7 @@ import numpy as np import sys -sys.path.append("../autokaggle/") +sys.path.append("..") +# print(sys.path) from autokaggle import * if __name__ == '__main__': diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 8c1ed50..ccb2064 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -1,6 +1,6 @@ import numpy as np import sys -sys.path.append("../autokaggle/") +sys.path.append("..") from autokaggle import * if __name__ == '__main__': @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle(LgbmRegressor) + clf = AutoKaggle(LgbmRegressor()) datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 
a7b66c332f6c96dee2d0932fdefc5f75a9fc1b80 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 29 Aug 2019 20:42:27 -0500 Subject: [PATCH 06/31] Use hyper-opt for search --- autokaggle/auto_ml.py | 8 +-- autokaggle/estimators.py | 99 ++++++++++++++++++++++++++------------ autokaggle/preprocessor.py | 8 +-- 3 files changed, 75 insertions(+), 40 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 6302d32..c9689c3 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -44,10 +44,7 @@ def fit(self, x, y, time_limit=None, data_info=None): you should warm-start your training from the pre-trained model. Past data will NOT be available for re-training. """ - - if time_limit is None: - time_limit = 24 * 60 * 60 - self.time_limit = time_limit + self.time_limit = time_limit if time_limit else 24 * 60 * 60 if x.shape[1] == 0: raise ValueError("No feature exist!") @@ -63,7 +60,6 @@ def fit(self, x, y, time_limit=None, data_info=None): # Fit Model and preprocessor x = self.preprocessor.fit(x, y, self.time_limit, data_info) self.model.fit(x, y) - self.model.save_model() self.is_trained = True if self.verbose: @@ -73,7 +69,7 @@ def fit(self, x, y, time_limit=None, data_info=None): def predict(self, x_test): """ This function should provide predictions of labels on (test) data. - The function predict eventually casdn return probabilities or continuous values. + The function predict eventually can return probabilities or continuous values. """ x_test = self.preprocessor.encode(x_test) y = self.model.predict(x_test, ) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 0d4e73d..0576430 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -6,14 +6,17 @@ import json from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import RandomizedSearchCV, cross_val_score from sklearn.model_selection import StratifiedKFold, KFold from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer from joblib import dump, load from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +import hyperopt +from hyperopt import tpe, hp, fmin, space_eval class TabularEstimator(BaseEstimator): @@ -27,17 +30,16 @@ def __init__(self, path=None, verbose=True, time_limit=None): self.objective = None abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) - self.clf = None - self.estimator = None + self.best_estimator_ = None def fit(self, x, y): self.init_model(y) self.search(x, y) - self.clf.fit(x, y) + self.best_estimator_.fit(x, y) self.save_model() def predict(self, x, y=None): - y = self.clf.predict(x, ) + y = self.best_estimator_.predict(x, ) return y @staticmethod @@ -54,29 +56,64 @@ def subsample(x, y, sample_percent): def search(self, x, y, search_iter=40, folds=3): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) + score_metric, skf = self.get_skf(folds) - if type(self.hparams) != list: - self.hparams = [self.hparams] - - best_params = {} - for idx, search_space in enumerate(self.hparams): - best_params.update(search_space) - if self.verbose: - print("Step: {}".format(idx+1)) - print("Search space:") - 
print(best_params) - score_metric, skf = self.get_skf(folds) - random_search = RandomizedSearchCV(self.estimator, param_distributions=best_params, n_iter=search_iter, - scoring=score_metric, - n_jobs=1, cv=skf, verbose=0, random_state=1001, iid=False) - random_search.fit(grid_train_x, grid_train_y) - best_params = random_search.best_params_ - for key, value in best_params.items(): - best_params[key] = [value] + self.hparams = space = hp.choice('classifier', [ + {'model': KNeighborsClassifier, + 'param': {'n_neighbors': + hp.choice('n_neighbors', range(3, 11)), + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree']), + 'leaf_size': hp.choice('leaf_size', range(1, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", + "chebyshev", "minkowski" + ])} + }, + {'model': SVC, + 'param': {'C': hp.lognormal('C', 0, 1), + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'rbf', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 15)), + 'gamma': hp.uniform('gamma', 0.001, 10000)} + } + ]) + + def objective_func(args): + clf = args['model'](**args['param']) + loss = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + print("CV Score:", loss) + print("\n=================") + return 1 - loss + + opt = space_eval(self.hparams, fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, + max_evals=search_iter)) + self.best_estimator_ = opt['model'](**opt['param']) - self.clf = random_search.best_estimator_ + return opt - return random_search.best_params_ + # def search(self, x, y, search_iter=40, folds=3): + # grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) + # + # if type(self.hparams) != list: + # self.hparams = [self.hparams] + # + # best_params = {} + # for idx, search_space in enumerate(self.hparams): + # best_params.update(search_space) + # if self.verbose: + # print("Step: {}".format(idx+1)) + # print("Search space:") + # print(best_params) + # score_metric, skf = self.get_skf(folds) + # random_search = RandomizedSearchCV(self.estimator, param_distributions=best_params, n_iter=search_iter, + # scoring=score_metric, + # n_jobs=1, cv=skf, verbose=0, random_state=1001, iid=False) + # random_search.fit(grid_train_x, grid_train_y) + # best_params = random_search.best_params_ + # for key, value in best_params.items(): + # best_params[key] = [value] + # + # self.best_estimator_ = random_search.best_estimator_ + # + # return random_search.best_params_ @abstractmethod def save_model(self): @@ -128,21 +165,21 @@ class LGBMMixIn: _default_hyperparams = "lgbm_hp.json" def save_model(self): - self.clf.booster_.save_model(self.save_filename) + self.best_estimator_.booster_.save_model(self.save_filename) def get_feature_importance(self): - if self.estimator: + if self.best_estimator_: print('Feature Importance:') - print(self.clf.feature_importances_) + print(self.best_estimator_.feature_importances_) class SklearnMixIn: def save_model(self): - dump(self.clf, self.save_filename) + dump(self.best_estimator_, self.save_filename) def load_model(self): - self.clf = load(self.save_filename) + self.best_estimator_ = load(self.save_filename) class SVMClassifier(Classifier, SklearnMixIn): diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 9509261..a9d56bf 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -253,15 +253,15 @@ def fit(self, raw_x, y, time_limit, data_info): 'TIME' for temporal feature, 'NUM' for other numerical feature, and 'CAT' for categorical feature. 
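        Illustrative sketch (not part of this patch; x_train, y_train and the per-type column
        counts are placeholders):
            data_info = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat)
            prep = TabularPreprocessor()
            prep.fit(x_train, y_train, 24 * 60 * 60, data_info)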
""" - # Get Meta-Feature self.budget = time_limit + # Extract or read data info self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) print('DATA_INFO: {}'.format(self.data_info)) + # Set the meta info for each data type self.n_time = sum(self.data_info == 'TIME') self.n_num = sum(self.data_info == 'NUM') self.n_cat = sum(self.data_info == 'CAT') - self.total_samples = raw_x.shape[0] print('#TIME features: {}'.format(self.n_time)) @@ -270,11 +270,13 @@ def fit(self, raw_x, y, time_limit, data_info): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x + + # convert to a dictionary of different datatype raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], 'NUM': raw_x[:, self.data_info == 'NUM'], 'CAT': raw_x[:, self.data_info == 'CAT']} - + # Init the label encoders for each categorical column for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat): self.cat_to_int_label[col_index] = {} From 27ed4b8e39a153586124de01d4d13088d8abd1c3 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Mon, 2 Sep 2019 17:56:33 -0500 Subject: [PATCH 07/31] Added code for primitives --- autokaggle/auto_ml.py | 9 +- autokaggle/preprocessor.py | 538 ++++++++++++++++++------------------- 2 files changed, 265 insertions(+), 282 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index c9689c3..f77e004 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -58,7 +58,8 @@ def fit(self, x, y, time_limit=None, data_info=None): # self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor - x = self.preprocessor.fit(x, y, self.time_limit, data_info) + self.preprocessor.fit(x, y, self.time_limit, data_info) + x = self.preprocessor.transform(x) self.model.fit(x, y) self.is_trained = True @@ -71,7 +72,7 @@ def predict(self, x_test): This function should provide predictions of labels on (test) data. The function predict eventually can return probabilities or continuous values. 
""" - x_test = self.preprocessor.encode(x_test) + x_test = self.preprocessor.transform(x_test) y = self.model.predict(x_test, ) if y is None: raise ValueError("Tabular predictor does not exist") @@ -91,5 +92,5 @@ def evaluate(self, x_test, y_test): return results def final_fit(self, x_train, y_train): - x_train = self.preprocessor.encode(x_train) - self.model.fit(x_train, y_train) \ No newline at end of file + x_train = self.preprocessor.transform(x_train) + self.model.fit(x_train, y_train) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index a9d56bf..f155a73 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -4,87 +4,16 @@ from scipy.stats import pearsonr from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler - +from sklearn.base import TransformerMixin +from sklearn.base import BaseEstimator +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from abc import abstractmethod +import collections +from lightgbm import LGBMClassifier, LGBMRegressor LEVEL_HIGH = 32 -def parallel_function(labels, first_batch_keys, task): - if task == 'label': - if min(labels) > first_batch_keys: - labels = labels - np.min(labels) - return labels.reshape(labels.shape[0], 1) - - elif task == 'frequency': - cat_dict = {} - n_rows = labels.shape[0] - labels = np.expand_dims(labels, axis=1) - - if min(labels) > first_batch_keys: - labels = labels - np.min(labels) - - frequencies = np.zeros((n_rows, 1)) - - for row_index in range(n_rows): - key = labels[row_index, 0] - if key in cat_dict: - cat_dict[key] += 1 - else: - cat_dict[key] = 1 - - n_level = len(cat_dict) - key_to_frequency = {} - - for key in cat_dict.keys(): - key_to_frequency[key] = cat_dict[key] / n_rows * n_level - - for row_index in range(n_rows): - key = labels[row_index, 0] - frequencies[row_index][0] = key_to_frequency[key] - - return frequencies - elif task == 'num_cat': - df = DataFrame(data=labels) - return df.join(df.groupby(1)[0].mean(), - rsuffix='r', - on=1).values[:, -1:] - elif task == 'cat_cat': - df = DataFrame(data=labels) - df[3] = list(range(len(labels))) - return df.join(df.groupby([0, 1]).count(), - rsuffix='r', - on=(0, 1)).values[:, -1:] - elif task == 'train_num_cat': - y = first_batch_keys[0] - df = DataFrame(data=labels) - fe = df.join(df.groupby(1)[0].mean(), - rsuffix='r', - on=1).values[:, -1:] - mu = abs(pearsonr(np.squeeze(np.array(fe)), y)[0]) - if np.isnan(mu): - mu = 0 - return [[first_batch_keys[1], first_batch_keys[2], mu, mu], first_batch_keys[3]] - - elif task == 'train_cat_cat': - y = first_batch_keys[0] - df = DataFrame(data=labels) - df[3] = list(range(len(labels))) - fe = df.join(df.groupby([0, 1]).count(), - rsuffix='r', - on=(0, 1)).values[:, -1:] - mu = abs(pearsonr(np.squeeze(np.array(fe)), y)[0]) - if np.isnan(mu): - mu = 0 - return [[first_batch_keys[1], first_batch_keys[2], mu], first_batch_keys[3]] - return None - - -def call_parallel(tasks): - results = [] - for t in tasks: - results.append(parallel_function(t[0], t[1], t[2])) - return results - - class TabularPreprocessor: def __init__(self): """ @@ -109,136 +38,10 @@ def __init__(self): self.n_time = None self.n_num = None self.n_cat = None - self.pca = None - self.scaler = None - - def remove_useless(self, x): - """ - Remove the columns whose max_value == min_value - """ - self.selected_cols = np.where(np.max(x, axis=0) - np.min(x, axis=0) != 0)[0] - return x[:, self.selected_cols] - - def process_time(self, x): - """ - Process the TIME 
features. Add the difference between consecutive columns as a feature. - """ - cols = range(self.n_time) - if len(cols) > 10: - cols = cols[:10] - x_time = x[:, cols] - for i in cols: - for j in range(i + 1, len(cols)): - x = np.append(x, np.expand_dims(x_time[:, i] - x_time[:, j], 1), 1) - return x - - def extract_data(self, raw_x): - """ - Encodes the catgorical data and returns all numeric values - """ - # only get numerical variables - ret = np.concatenate([raw_x['TIME'], raw_x['NUM'], raw_x['CAT']], axis=1) - n_rows = ret.shape[0] - n_num_col = ret.shape[1] - self.n_cat - - n_cat_col = self.n_cat - if n_cat_col <= 0: - return ret.astype(np.float64) - - # preprocess (multi-value) categorical data - for col_index in range(n_num_col, n_num_col + n_cat_col): - for row_index in range(n_rows): - key = str(ret[row_index, col_index]) - if key in self.cat_to_int_label[col_index]: - ret[row_index, col_index] = self.cat_to_int_label[col_index][key] - continue - new_value = len(self.cat_to_int_label[col_index]) - self.cat_to_int_label[col_index][key] = new_value - ret[row_index, col_index] = new_value - - return ret.astype(np.float64) - - def cat_to_num(self, x, y=None): - if y is not None: - mark = self.n_time + self.n_num - - for col_index in range(self.n_time + self.n_num, self.n_time + self.n_num + self.n_cat): - if self.n_first_batch_keys[col_index] <= LEVEL_HIGH: - self.num_cat_pair[mark] = (col_index,) - mark += 1 - else: - self.num_cat_pair[mark] = (col_index, col_index) - mark += 1 - - mark_1 = 0 - tasks = [] - for i, cat_col_index1 in enumerate(self.high_level_cat_keys): - for cat_col_index2 in self.high_level_cat_keys[i + 1:]: - tasks.append((x[:, (cat_col_index1, cat_col_index2)], - [y, cat_col_index1, cat_col_index2, mark_1], - 'train_cat_cat')) - mark_1 += 1 - - all_results = call_parallel(tasks) - - num_cat_pair_1 = {} - pearsonr_dict_1 = {} - for result in all_results: - if result[0][-1] > 0.001: - pearsonr_dict_1[result[1]] = result[0][-1] - num_cat_pair_1[result[1]] = result[0] - pearsonr_high_1 = sorted(pearsonr_dict_1, key=pearsonr_dict_1.get, reverse=True)[:self.feature_add_cat_cat] - num_cat_pair_1 = {key: num_cat_pair_1[key] for key in pearsonr_high_1} - num_cat_pair_1 = {i + mark: num_cat_pair_1[key] for i, key in enumerate(num_cat_pair_1)} - self.num_cat_pair.update(num_cat_pair_1) - mark += len(pearsonr_high_1) - - mark_2 = 0 - tasks_2 = [] - for cat_col_index in self.high_level_cat_keys: - for num_col_index in range(self.n_time, self.n_time + self.n_num): - tasks_2.append((x[:, (num_col_index, cat_col_index)], - [y, num_col_index, cat_col_index, mark_2], - 'train_num_cat')) - mark_2 += 1 - - all_results = call_parallel(tasks_2) - - num_cat_pair_2 = {} - pearsonr_dict_2 = {} - for result in all_results: - if result[0][-1] > 0.001: - pearsonr_dict_2[result[1]] = result[0][-1] - num_cat_pair_2[result[1]] = result[0] - pearsonr_high_2 = sorted(pearsonr_dict_2, key=pearsonr_dict_2.get, reverse=True)[:self.feature_add_cat_num] - num_cat_pair_2 = {key: num_cat_pair_2[key] for key in pearsonr_high_2} - num_cat_pair_2 = {i + mark: num_cat_pair_2[key] for i, key in enumerate(num_cat_pair_2)} - self.num_cat_pair.update(num_cat_pair_2) - self.order_num_cat_pair = sorted(list(self.num_cat_pair.keys())) - print('num_cat_pair_2:', num_cat_pair_2) - - tasks = [] - for key in self.order_num_cat_pair: - if len(self.num_cat_pair[key]) == 1: - (col_index,) = self.num_cat_pair[key] - tasks.append((x[:, col_index], self.n_first_batch_keys[col_index], 'label')) - if 
len(self.num_cat_pair[key]) == 2: - (col_index, col_index) = self.num_cat_pair[key] - tasks.append((x[:, col_index], self.n_first_batch_keys[col_index], 'frequency')) - if len(self.num_cat_pair[key]) == 3: - (cat_col_index1, cat_col_index2, mu) = self.num_cat_pair[key] - tasks.append((x[:, (cat_col_index1, - cat_col_index2)], self.n_first_batch_keys[cat_col_index1], 'cat_cat')) - elif len(self.num_cat_pair[key]) == 4: - (num_col_index, cat_col_index, mu, a) = self.num_cat_pair[key] - tasks.append((x[:, (num_col_index, cat_col_index)], self.n_first_batch_keys[cat_col_index], 'num_cat')) - - results = call_parallel(tasks) - all_num = x.shape[1] - self.n_cat - results = [x[:, :all_num]] + results - ret = np.concatenate(results, axis=1) - - return ret + self.cat_col = None + self.num_col = None + self.time_col = None + self.pipeline = None def fit(self, raw_x, y, time_limit, data_info): """ @@ -264,6 +67,10 @@ def fit(self, raw_x, y, time_limit, data_info): self.n_cat = sum(self.data_info == 'CAT') self.total_samples = raw_x.shape[0] + self.cat_col = list(np.where(self.data_info == 'CAT')[0]) + self.num_col = list(np.where(self.data_info == 'NUM')[0]) + self.time_col = list(np.where(self.data_info == 'TIME')[0]) + print('#TIME features: {}'.format(self.n_time)) print('#NUM features: {}'.format(self.n_num)) print('#CAT features: {}'.format(self.n_cat)) @@ -271,54 +78,21 @@ def fit(self, raw_x, y, time_limit, data_info): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - # convert to a dictionary of different datatype - raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], - 'NUM': raw_x[:, self.data_info == 'NUM'], - 'CAT': raw_x[:, self.data_info == 'CAT']} - - # Init the label encoders for each categorical column - for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat): - self.cat_to_int_label[col_index] = {} - - x = self.extract_data(raw_x) - - d_size = x.shape[0] * x.shape[1] / self.budget - if d_size > 35000: - self.feature_add_high_cat = 0 - else: - self.feature_add_high_cat = 10 - - # Iterate cat features - for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat): - self.n_first_batch_keys[col_index] = len(self.cat_to_int_label[col_index]) - high_level_cat_keys_tmp = sorted(self.n_first_batch_keys, key=self.n_first_batch_keys.get, reverse=True)[ - :self.feature_add_high_cat] - for i in high_level_cat_keys_tmp: - if self.n_first_batch_keys[i] > 1e2: - self.high_level_cat_keys.append(i) - - # Convert NaN to zeros - x = np.nan_to_num(x) - - # Encode high-order categorical data to numerical with frequency - x = self.cat_to_num(x, y) - - # Standardize numeric columns - if self.n_num > 0: - self.scaler = StandardScaler() - x[:, :self.n_num] = self.scaler.fit_transform(x[:, :self.n_num]) - - # PCA for feature generation - self.pca = PCA(n_components=0.99, svd_solver='full') - x_pca = self.pca.fit_transform(x[:, :self.n_num]) - x = np.concatenate([x, x_pca], axis=1) - - x = self.process_time(x) - x = self.remove_useless(x) - - return x - - def encode(self, raw_x, time_limit=None): + self.pipeline = Pipeline([ + ('label_encoder', CatEncoder(selected_columns=self.cat_col)), + ('imputer', Imputation(selected_columns=self.cat_col + self.num_col + self.time_col)), + ('scaler', TabScaler(selected_columns=self.num_col)), + ('pca', TabPCA(selected_columns=self.num_col)), + ('time_diff', TimeDiff(selected_columns=self.time_col)), + ('filter', 
FilterConstant(selected_columns=self.time_col + self.num_col + self.cat_col)), + ('pearson_corr', FeatureFilter(selected_columns=self.time_col + self.num_col + self.cat_col)), + ('lgbm_feat_selection', FeatureImportance(selected_columns=self.time_col + self.num_col + self.cat_col)), + ]) + self.pipeline.fit(raw_x, y) + + return self + + def transform(self, raw_x, time_limit=None): """ This function should train the model parameters. @@ -340,28 +114,7 @@ def encode(self, raw_x, time_limit=None): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], - 'NUM': raw_x[:, self.data_info == 'NUM'], - 'CAT': raw_x[:, self.data_info == 'CAT']} - x = self.extract_data(raw_x) - - # Convert NaN to zeros - x = np.nan_to_num(x) - - # Encode high-order categorical data to numerical with frequency - x = self.cat_to_num(x) - - x = self.process_time(x) - - if self.scaler: - x[:, :self.n_num] = self.scaler.transform(x[:, :self.n_num]) - if self.pca: - x_pca = self.pca.transform(x[:, :self.n_num]) - x = np.concatenate([x, x_pca], axis=1) - - if self.selected_cols is not None: - x = x[:, self.selected_cols] - return x + return self.pipeline.transform(raw_x) @staticmethod def extract_data_info(raw_x): @@ -380,3 +133,232 @@ def extract_data_info(raw_x): except: data_info.append('CAT') return np.array(data_info) + + +class TabularData: + def __init__(self, data, data_info): + self.data = data + self.data_info = data_info + + self.total_samples = 0 + + self.cat_to_int_label = {} + self.n_first_batch_keys = {} + self.high_level_cat_keys = [] + + self.num_cat_pair = {} + self.feature_add_high_cat = 0 + self.feature_add_cat_num = 10 + self.feature_add_cat_cat = 10 + self.order_num_cat_pair = {} + + self.selected_cols = None + + self.n_time = None + self.n_num = None + self.n_cat = None + + +class Primitive(BaseEstimator, TransformerMixin): + def __init__(self, selected_columns=[], selected_type=None): + self.selected = selected_columns + self.selected_type = selected_type + + @abstractmethod + def fit(self, X, y=None): + pass + + @abstractmethod + def transform(self, X, y=None): + pass + + +class TabScaler(Primitive): + scaler = None + + def fit(self, X, y=None): + self.scaler = StandardScaler() + self.scaler.fit(X[:, self.selected], y) + return self + + def transform(self, X, y=None): + X[:, self.selected] = self.scaler.transform(X[:, self.selected]) + return X + + +class CatEncoder(Primitive): + cat_to_int_label = {} + + def fit(self, X, y=None): + for col_index in self.selected: + self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) + for row_index in range(len(X)): + key = str(X[row_index, col_index]) + if key not in self.cat_to_int_label[col_index]: + self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) + return self + + def transform(self, X, y=None): + for col_index in self.selected: + for row_index in range(len(X)): + key = str(X[row_index, col_index]) + X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) + return X + + +class FilterConstant(Primitive): + selected_cols = [] + + def fit(self, X, y=None): + self.selected_cols = np.where(np.max(X, axis=0) - np.min(X, axis=0) != 0)[0] + return self + + def transform(self, X, y=None): + return X[:, self.selected_cols] + + +class TimeDiff(Primitive): + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + x_time = X[:, self.selected] + len_cols = 
x_time.shape[1] + for i in range(len_cols): + for j in range(i + 1, len_cols): + X = np.append(X, np.expand_dims(x_time[:, i] - x_time[:, j], 1), 1) + return X + + +class TabPCA(Primitive): + pca = None + + def fit(self, X, y=None): + self.pca = PCA(n_components=0.99, svd_solver='full') + return self + + def transform(self, X, y=None): + x_pca = self.pca.fit_transform(X[:, self.selected]) + return np.concatenate([X, x_pca], axis=1) + + +class CatCount(Primitive): + count_dict = {} + + def fit(self, X, y=None): + for col in self.selected: + self.count_dict[col] = collections.Counter(X[:, col]) + return self + + def transform(self, X, y=None): + for col in self.selected: + gen_freq = np.vectorize(lambda key: self.count_dict[col][key]) + X[:, col] = gen_freq(X[:, col]) + return X + + +class LogTransform(Primitive): + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + for col in self.selected: + X[:, col] = np.square(np.log(X[:, col])) + return X + + +class Imputation(Primitive): + imputer = None + + def fit(self, X, y=None): + self.imputer = SimpleImputer(strategy='most_frequent') + self.imputer.fit(X) + return self + + def transform(self, X, y=None): + return self.imputer.transform(X) + + +class FeatureFilter(Primitive): + def __init__(self, selected_columns=[], selected_type=None, threshold=0.001): + super().__init__(selected_columns, selected_type) + self.threshold = threshold + self.drop_columns = [] + + def fit(self, X, y=None): + for col in self.selected: + mu = abs(pearsonr(X[:, col], y)[0]) + if np.isnan(mu): + mu = 0 + if mu < self.threshold: + self.drop_columns.append(col) + return self + + def transform(self, X, y=None): + X = np.delete(X, self.drop_columns, axis=1) + return X + + +class FeatureImportance(Primitive): + def __init__(self, selected_columns=[], selected_type=None, threshold=0.001, task_type='classification'): + super().__init__(selected_columns, selected_type) + self.threshold = threshold + self.drop_columns = [] + self.task_type = task_type + + def fit(self, X, y=None): + if self.task_type == 'classification': + n_classes = len(set(y)) + if n_classes == 2: + estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective='binary') + else: + estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + num_class=n_classes, + objective='multiclass') + elif self.task_type == 'regression': + estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective='regression') + estimator.fit(X, y) + feature_importance = estimator.feature_importances_ + feature_importance = feature_importance/feature_importance.mean() + self.drop_columns = np.where(feature_importance < self.threshold)[0] + return self + + def transform(self, X, y=None): + X = np.delete(X, self.drop_columns, axis=1) + return X + + +if __name__ == "__main__": + ntime, nnum, ncat = 4, 10, 8 + nsample = 1000 + x_num = np.random.random([nsample, nnum]) + x_time = np.random.random([nsample, ntime]) + x_cat = np.random.randint(0, 10, [nsample, ncat]) + + x_all = np.concatenate([x_num, x_time, x_cat], axis=1) + x_train = x_all[:int(nsample * 0.8), :] + x_test = x_all[int(nsample * 0.8):, :] + + y_all = np.random.randint(0, 2, nsample) + y_train = y_all[:int(nsample * 0.8)] + y_test = y_all[int(nsample * 0.8):] + + datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) + print(x_train[:4, 20]) + prep = TabularPreprocessor() + prep.fit(x_train, y_train, 24*60*60, datainfo) + x_new = prep.transform(x_train) + + print("-----") + print(x_new[:4, 
2]) + From 1e7c94fb44cf455afbb999b93223c1ff75992363 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 3 Sep 2019 12:31:37 -0500 Subject: [PATCH 08/31] Fetch multiple trials from hyperopt --- autokaggle/estimators.py | 109 +++++++++++------- examples/tabular_classification_multiclass.py | 1 - 2 files changed, 65 insertions(+), 45 deletions(-) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 0576430..b5dd7d9 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -10,13 +10,49 @@ from sklearn.model_selection import StratifiedKFold, KFold from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer from joblib import dump, load from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json import hyperopt -from hyperopt import tpe, hp, fmin, space_eval +from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK + +knn_classifier_params = {'n_neighbors': hp.choice('n_neighbors', range(2, 20)), + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), + 'leaf_size': hp.choice('leaf_size', range(5, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", + "chebyshev", "minkowski" + ]), + 'p': hp.choice('p', range(1, 4)), + } +svc_params = {'C': hp.lognormal('C', 0, 1), + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 6)), + 'gamma': hp.uniform('gamma', 0.001, 10000), + 'max_iter': 50000, + } + +random_forest_classifier_params = {'criterion': hp.choice('criterion', ['entropy', 'gini']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))} + +lgbm_classifier_params = {'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), + } + +adaboost_classifier_params = {'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), + } class TabularEstimator(BaseEstimator): @@ -31,6 +67,7 @@ def __init__(self, path=None, verbose=True, time_limit=None): abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) self.best_estimator_ = None + self.ensemble_models = True def fit(self, x, y): self.init_model(y) @@ -54,25 +91,25 @@ def subsample(x, y, sample_percent): grid_train_x, grid_train_y = x[idx, :], y[idx] return grid_train_x, grid_train_y - def search(self, x, y, search_iter=40, folds=3): + def search(self, x, y, search_iter=4, folds=3): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) score_metric, skf = self.get_skf(folds) - self.hparams = space = hp.choice('classifier', [ + self.hparams = hp.choice('classifier', [ {'model': KNeighborsClassifier, - 'param': {'n_neighbors': - hp.choice('n_neighbors', range(3, 11)), - 
'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree']), - 'leaf_size': hp.choice('leaf_size', range(1, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", - "chebyshev", "minkowski" - ])} + 'param': knn_classifier_params }, {'model': SVC, - 'param': {'C': hp.lognormal('C', 0, 1), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'rbf', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 15)), - 'gamma': hp.uniform('gamma', 0.001, 10000)} + 'param': svc_params + }, + {'model': RandomForestClassifier, + 'param': random_forest_classifier_params + }, + {'model': LGBMClassifier, + 'param': lgbm_classifier_params + }, + {'model': AdaBoostClassifier, + 'param': adaboost_classifier_params } ]) @@ -81,39 +118,23 @@ def objective_func(args): loss = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() print("CV Score:", loss) print("\n=================") - return 1 - loss + return {'loss': 1 - loss, 'status': STATUS_OK, 'space': args} - opt = space_eval(self.hparams, fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, + trials = Trials() + opt = space_eval(self.hparams, fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=search_iter)) - self.best_estimator_ = opt['model'](**opt['param']) + if self.ensemble_models: + best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) + estimator_list = [] + for i in range(2): + model_params = best_trials[i]['space'] + est = model_params['model'](**model_params['param']) + estimator_list.append(est) + self.best_estimator_ = Ensembler(x, y, estimator_list) + else: + self.best_estimator_ = opt['model'](**opt['param']) return opt - - # def search(self, x, y, search_iter=40, folds=3): - # grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) - # - # if type(self.hparams) != list: - # self.hparams = [self.hparams] - # - # best_params = {} - # for idx, search_space in enumerate(self.hparams): - # best_params.update(search_space) - # if self.verbose: - # print("Step: {}".format(idx+1)) - # print("Search space:") - # print(best_params) - # score_metric, skf = self.get_skf(folds) - # random_search = RandomizedSearchCV(self.estimator, param_distributions=best_params, n_iter=search_iter, - # scoring=score_metric, - # n_jobs=1, cv=skf, verbose=0, random_state=1001, iid=False) - # random_search.fit(grid_train_x, grid_train_y) - # best_params = random_search.best_params_ - # for key, value in best_params.items(): - # best_params[key] = [value] - # - # self.best_estimator_ = random_search.best_estimator_ - # - # return random_search.best_params_ @abstractmethod def save_model(self): diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 13612f6..0da02fe 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -3,7 +3,6 @@ sys.path.append("..") # print(sys.path) from autokaggle import * - if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 nsample = 10000 From c5158ceb0868978e1c31e443d8f2bd8b9c547e96 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 3 Sep 2019 17:15:02 -0500 Subject: [PATCH 09/31] Fix regression hparamspace --- autokaggle/__init__.py | 3 +- autokaggle/auto_ml.py | 2 +- autokaggle/ensemblers.py | 57 +++++++-- autokaggle/estimators.py | 228 ++++++++++++++++----------------- examples/benchmarking.py | 6 +- examples/tabular_regression.py | 2 +- 6 files changed, 170 insertions(+), 128 deletions(-) diff --git 
a/autokaggle/__init__.py b/autokaggle/__init__.py index 3961f6d..0245d5b 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,2 +1,3 @@ from autokaggle.auto_ml import AutoKaggle -from autokaggle.estimators import * \ No newline at end of file +from autokaggle.estimators import * +from autokaggle.ensemblers import * diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index f77e004..7cfcbff 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -13,7 +13,7 @@ class AutoKaggle(BaseEstimator): - def __init__(self, estimator=LgbmClassifier(), preprocessor=TabularPreprocessor(), path=None, verbose=True): + def __init__(self, estimator=Classifier(), preprocessor=TabularPreprocessor(), path=None, verbose=True): """ Initialization function for tabular supervised learner. """ diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 532072e..e931954 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -12,26 +12,65 @@ from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error from joblib import dump, load +from scipy import stats +from lightgbm import LGBMClassifier, LGBMRegressor class RankedEnsembler: - def __init__(self, ensemble_method='max_voting'): + def __init__(self, estimator_list, ensemble_method='max_voting'): self.ensemble_method = ensemble_method + self.estimators = estimator_list - def fit(self, predictions, y_true): - pass + def fit(self, X, y): + for est in self.estimators: + est.fit(X, y) - def predict(self, predictions): + def predict(self, X): + predictions = np.zeros((len(X), len(self.estimators))) + for i, est in enumerate(self.estimators): + predictions[:, i] = est.predict(X) + if self.ensemble_method == 'median': - return predictions.apply(np.median, axis=1).values + return np.median(predictions, axis=1) elif self.ensemble_method == 'mean': - return predictions.apply(np.average, axis=1).values + return np.mean(predictions, axis=1) elif self.ensemble_method == 'max': - return predictions.apply(np.max, axis=1).values + return np.max(predictions, axis=1) elif self.ensemble_method == 'min': - return predictions.apply(np.min, axis=1).values + return np.min(predictions, axis=1) elif self.ensemble_method == 'max_voting': - return predictions.apply(mode, axis=1).values + return stats.mode(predictions, axis=1)[0] + + +class StackingEnsembler: + def __init__(self, estimator_list, objective): + self.estimator_list = estimator_list + self.objective = objective + if self.objective == 'regression': + self.stacking_estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) + elif self.objective == 'multiclass' or self.objective == 'binary': + self.stacking_estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) + + def fit(self, X, y): + for est in self.estimator_list: + est.fit(X, y) + predictions = np.zeros((len(X), len(self.estimator_list))) + for i, est in enumerate(self.estimator_list): + predictions[:, i] = est.predict(X) + self.stacking_estimator.fit(predictions, y) + + def predict(self, X): + predictions = np.zeros((len(X), len(self.estimator_list))) + for i, est in enumerate(self.estimator_list): + predictions[:, i] = est.predict(X) + return self.stacking_estimator.predict(predictions) + class EnsembleSelection: def __init__(self, ensemble_size=25): diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index b5dd7d9..a133002 100644 --- a/autokaggle/estimators.py 
+++ b/autokaggle/estimators.py @@ -10,49 +10,88 @@ from sklearn.model_selection import StratifiedKFold, KFold from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor,\ + ExtraTreesRegressor +from sklearn.linear_model import Ridge from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer from joblib import dump, load from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK knn_classifier_params = {'n_neighbors': hp.choice('n_neighbors', range(2, 20)), - 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), - 'leaf_size': hp.choice('leaf_size', range(5, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", - "chebyshev", "minkowski" - ]), - 'p': hp.choice('p', range(1, 4)), - } + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), + 'leaf_size': hp.choice('leaf_size', range(5, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", + "chebyshev", "minkowski" + ]), + 'p': hp.choice('p', range(1, 4)), + } + svc_params = {'C': hp.lognormal('C', 0, 1), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 6)), - 'gamma': hp.uniform('gamma', 0.001, 10000), - 'max_iter': 50000, - } + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 6)), + 'gamma': hp.uniform('gamma', 0.001, 10000), + 'max_iter': 50000, + } random_forest_classifier_params = {'criterion': hp.choice('criterion', ['entropy', 'gini']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))} + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))} lgbm_classifier_params = {'boosting_type': 'gbdt', - 'min_split_gain': 0.1, - 'subsample': 0.8, - 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), - } + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), + } adaboost_classifier_params = {'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), - } + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), + } + +extra_trees_regressor_params = { + 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 
200)), + 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), + 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), + 'min_impurity_decrease': 0.0 +} +ridge_params = { + 'fit_intercept': True, + 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), + 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) +} +random_forest_regressor_params = { + 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) +} +lgbm_regressor_params = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), +} +adaboost_regressor_params = { + 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), +} class TabularEstimator(BaseEstimator): @@ -65,19 +104,21 @@ def __init__(self, path=None, verbose=True, time_limit=None): self.time_limit = time_limit self.objective = None abs_cwd = os.path.split(os.path.abspath(__file__))[0] - self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) + # self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) self.best_estimator_ = None self.ensemble_models = True def fit(self, x, y): - self.init_model(y) + if self.objective == 'classification': + n_classes = len(set(y)) + self.objective = 'binary' if n_classes == 2 else 'multiclass' self.search(x, y) self.best_estimator_.fit(x, y) self.save_model() def predict(self, x, y=None): - y = self.best_estimator_.predict(x, ) - return y + y_pred = self.best_estimator_.predict(x, ) + return y_pred @staticmethod def subsample(x, y, sample_percent): @@ -95,24 +136,6 @@ def search(self, x, y, search_iter=4, folds=3): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) score_metric, skf = self.get_skf(folds) - self.hparams = hp.choice('classifier', [ - {'model': KNeighborsClassifier, - 'param': knn_classifier_params - }, - {'model': SVC, - 'param': svc_params - }, - {'model': RandomForestClassifier, - 'param': random_forest_classifier_params - }, - {'model': LGBMClassifier, - 'param': lgbm_classifier_params - }, - {'model': AdaBoostClassifier, - 'param': adaboost_classifier_params - } - ]) - def objective_func(args): clf = args['model'](**args['param']) loss = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() @@ -121,8 +144,7 @@ def objective_func(args): return {'loss': 1 - loss, 'status': STATUS_OK, 'space': args} trials = Trials() - opt = space_eval(self.hparams, fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, - max_evals=search_iter)) + best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=search_iter) if self.ensemble_models: best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) estimator_list = [] @@ -130,35 +152,45 @@ def objective_func(args): model_params = 
best_trials[i]['space'] est = model_params['model'](**model_params['param']) estimator_list.append(est) - self.best_estimator_ = Ensembler(x, y, estimator_list) + # self.best_estimator_ = RankedEnsembler(estimator_list, ensemble_method='max_voting') + self.best_estimator_ = StackingEnsembler(estimator_list, objective=self.objective) else: + opt = space_eval(self.hparams, best) self.best_estimator_ = opt['model'](**opt['param']) - - return opt @abstractmethod def save_model(self): pass - @abstractmethod - def init_model(self, y): - pass - @abstractmethod def get_skf(self, folds): pass - def __repr__(self): - return "TabularEstimator model" - class Classifier(TabularEstimator): """Classifier class. - It is used for tabular data classification with lightgbm classifier. + It is used for tabular data classification. """ def __init__(self, path=None, verbose=True, time_limit=None): super().__init__(path, verbose, time_limit) self.objective = 'classification' + self.hparams = hp.choice('classifier', [ + {'model': KNeighborsClassifier, + 'param': knn_classifier_params + }, + {'model': SVC, + 'param': svc_params + }, + {'model': RandomForestClassifier, + 'param': random_forest_classifier_params + }, + {'model': LGBMClassifier, + 'param': lgbm_classifier_params + }, + {'model': AdaBoostClassifier, + 'param': adaboost_classifier_params + } + ]) def get_skf(self, folds): if self.objective == 'binary': @@ -172,75 +204,41 @@ def get_skf(self, folds): class Regressor(TabularEstimator): """Regressor class. - It is used for tabular data regression with lightgbm regressor. + It is used for tabular data regression. """ def __init__(self, path=None, verbose=True, time_limit=None): super().__init__(path, verbose, time_limit) self.objective = 'regression' + self.hparams = hp.choice('regressor', [ + {'model': ExtraTreesRegressor, + 'param': extra_trees_regressor_params + }, + {'model': Ridge, + 'param': ridge_params + }, + {'model': RandomForestRegressor, + 'param': random_forest_regressor_params + }, + {'model': LGBMRegressor, + 'param': lgbm_regressor_params + }, + {'model': AdaBoostRegressor, + 'param': adaboost_regressor_params + } + ]) def get_skf(self, folds): return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) class LGBMMixIn: - _default_hyperparams = "lgbm_hp.json" - def save_model(self): self.best_estimator_.booster_.save_model(self.save_filename) - - def get_feature_importance(self): - if self.best_estimator_: - print('Feature Importance:') - print(self.best_estimator_.feature_importances_) class SklearnMixIn: - def save_model(self): dump(self.best_estimator_, self.save_filename) def load_model(self): self.best_estimator_ = load(self.save_filename) - - -class SVMClassifier(Classifier, SklearnMixIn): - _default_hyperparams = "svm_hp.json" - - def init_model(self, y): - n_classes = len(set(y)) - self.objective = 'binary' if n_classes == 2 else 'multiclass' - self.estimator = SVC() - - -class RFClassifier(Classifier, SklearnMixIn): - _default_hyperparams = "rf_hp.json" - - def init_model(self, y): - n_classes = len(set(y)) - self.objective = 'binary' if n_classes == 2 else 'multiclass' - self.estimator = RandomForestClassifier() - -class LgbmClassifier(Classifier, LGBMMixIn): - def init_model(self, y): - n_classes = len(set(y)) - if n_classes == 2: - self.objective = 'binary' - self.estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - else: - self.objective = 'multiclass' - self.estimator = LGBMClassifier(silent=False, - 
verbose=-1, - n_jobs=1, - num_class=n_classes, - objective=self.objective) - - -class LgbmRegressor(Regressor, LGBMMixIn): - def init_model(self, y): - self.estimator = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) \ No newline at end of file diff --git a/examples/benchmarking.py b/examples/benchmarking.py index c62b1dc..a5e7d69 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -1,3 +1,5 @@ +import string +import random import sys sys.path.append("../") import numpy as np @@ -15,10 +17,12 @@ import warnings from abc import abstractmethod + def generate_rand_string(size): chars = string.ascii_uppercase + string.digits return ''.join(random.choice(chars) for _ in range(size)) + class BenchmarkingBase: results = None cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", "balanced_accuracy", "F1_score", "AUC"] @@ -118,7 +122,7 @@ def evaluate(self, task_id, time_limit=10*60): if task.task_type == 'Supervised Classification': automl = AutoKaggle() if not self.estimator_type else AutoKaggle(self.estimator_type) elif task.task_type == 'Supervised Regression': - automl = AutoKaggle(LgbmRegressor) if not self.estimator_type else AutoKaggle(self.estimator_type) + automl = AutoKaggle(Regressor()) if not self.estimator_type else AutoKaggle(self.estimator_type) else: print("UNSUPPORTED TASK_TYPE") assert(0) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index ccb2064..c338dd2 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle(LgbmRegressor()) + clf = AutoKaggle(Regressor()) datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 6dd78ae12e5bd2fa19d1ac6dc5fb93324c1b2407 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 4 Sep 2019 10:46:37 -0500 Subject: [PATCH 10/31] Resolve fziling datasets --- autokaggle/estimators.py | 56 +++++++++-------- autokaggle/preprocessor.py | 125 +++++++++++++++++++++++++++++-------- examples/benchmarking.py | 102 ++++++++++++++++-------------- 3 files changed, 185 insertions(+), 98 deletions(-) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index a133002..3209710 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -21,6 +21,8 @@ import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK + +# TODO: Way to change the default hparams knn_classifier_params = {'n_neighbors': hp.choice('n_neighbors', range(2, 20)), 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), 'leaf_size': hp.choice('leaf_size', range(5, 50)), @@ -104,15 +106,18 @@ def __init__(self, path=None, verbose=True, time_limit=None): self.time_limit = time_limit self.objective = None abs_cwd = os.path.split(os.path.abspath(__file__))[0] - # self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) self.best_estimator_ = None - self.ensemble_models = True + self.use_ensembling = False + self.hparams = None + self.num_estimators_ensemble = 2 + self.ensemble_strategy = 'ranked_ensembling' + self.ensemble_method = 'max_voting' def fit(self, x, y): if self.objective == 'classification': n_classes = len(set(y)) self.objective = 'binary' if n_classes == 2 else 'multiclass' - self.search(x, y) + self.best_estimator_, _ = self.search(x, y) self.best_estimator_.fit(x, y) self.save_model() @@ 
-122,6 +127,7 @@ def predict(self, x, y=None): @staticmethod def subsample(x, y, sample_percent): + # TODO: Add way to balance the subsample # Set small sample for hyper-param search if x.shape[0] > 600: grid_train_percentage = max(600.0 / x.shape[0], sample_percent) @@ -132,31 +138,37 @@ def subsample(x, y, sample_percent): grid_train_x, grid_train_y = x[idx, :], y[idx] return grid_train_x, grid_train_y - def search(self, x, y, search_iter=4, folds=3): - grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) + def search(self, x, y, search_iter=4, folds=3, sample_percent=0.1): + grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=sample_percent) score_metric, skf = self.get_skf(folds) def objective_func(args): clf = args['model'](**args['param']) - loss = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() - print("CV Score:", loss) - print("\n=================") - return {'loss': 1 - loss, 'status': STATUS_OK, 'space': args} + eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + if self.verbose: + print("CV Score:", eval_score) + print("\n=================") + return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=search_iter) - if self.ensemble_models: + if self.use_ensembling: best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) estimator_list = [] - for i in range(2): + for i in range(self.num_estimators_ensemble): model_params = best_trials[i]['space'] est = model_params['model'](**model_params['param']) estimator_list.append(est) - # self.best_estimator_ = RankedEnsembler(estimator_list, ensemble_method='max_voting') - self.best_estimator_ = StackingEnsembler(estimator_list, objective=self.objective) + if self.ensemble_strategy == 'ranked_ensembling': + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.ensemble_method) + elif self.ensemble_strategy == 'stacking': + best_estimator_ = StackingEnsembler(estimator_list, objective=self.objective) + else: + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.ensemble_method) else: opt = space_eval(self.hparams, best) - self.best_estimator_ = opt['model'](**opt['param']) + best_estimator_ = opt['model'](**opt['param']) + return best_estimator_, trials @abstractmethod def save_model(self): @@ -174,6 +186,7 @@ class Classifier(TabularEstimator): def __init__(self, path=None, verbose=True, time_limit=None): super().__init__(path, verbose, time_limit) self.objective = 'classification' + # TODO: add choice to the set of estimators self.hparams = hp.choice('classifier', [ {'model': KNeighborsClassifier, 'param': knn_classifier_params @@ -209,6 +222,7 @@ class Regressor(TabularEstimator): def __init__(self, path=None, verbose=True, time_limit=None): super().__init__(path, verbose, time_limit) self.objective = 'regression' + # TODO: add choice to the set of estimators self.hparams = hp.choice('regressor', [ {'model': ExtraTreesRegressor, 'param': extra_trees_regressor_params @@ -229,16 +243,4 @@ def __init__(self, path=None, verbose=True, time_limit=None): def get_skf(self, folds): return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - - -class LGBMMixIn: - def save_model(self): - self.best_estimator_.booster_.save_model(self.save_filename) - - -class SklearnMixIn: - def save_model(self): - dump(self.best_estimator_, 
self.save_filename) - - def load_model(self): - self.best_estimator_ = load(self.save_filename) + diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index f155a73..84a9460 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -3,7 +3,7 @@ import scipy from scipy.stats import pearsonr from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer from sklearn.base import TransformerMixin from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer @@ -164,32 +164,73 @@ def __init__(self, selected_columns=[], selected_type=None): self.selected = selected_columns self.selected_type = selected_type - @abstractmethod def fit(self, X, y=None): + if not self.selected: + return self + return self._fit(X, y) + + def transform(self, X, y=None): + if not self.selected: + return X + return self._transform(X, y) + + @abstractmethod + def _fit(self, X, y=None): pass @abstractmethod - def transform(self, X, y=None): + def _transform(self, X, y=None): pass class TabScaler(Primitive): scaler = None - def fit(self, X, y=None): + def _fit(self, X, y=None): self.scaler = StandardScaler() self.scaler.fit(X[:, self.selected], y) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): X[:, self.selected] = self.scaler.transform(X[:, self.selected]) return X +class BoxCox(Primitive): + transformer = None + + def _fit(self, X, y=None): + self.transformer = PowerTransformer() + self.transformer.fit(X[:, self.selected], y) + return self + + def _transform(self, X, y=None): + X[:, self.selected] = self.transformer.transform(X[:, self.selected]) + return X + + +class Binning(Primitive): + binner = None + + def __init__(self, selected_columns=[], selected_type=None, strategy='quantile', encoding='ordinal'): + super().__init__(selected_columns, selected_type) + self.strategy = strategy + self.encoding = encoding + + def _fit(self, X, y=None): + self.binner = KBinsDiscretizer(strategy=self.strategy, encode=self.encoding) + self.binner.fit(X[:, self.selected], y) + return self + + def _transform(self, X, y=None): + X[:, self.selected] = self.binner.transform(X[:, self.selected]) + return X + + class CatEncoder(Primitive): cat_to_int_label = {} - def fit(self, X, y=None): + def _fit(self, X, y=None): for col_index in self.selected: self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) for row_index in range(len(X)): @@ -198,7 +239,27 @@ def fit(self, X, y=None): self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): + for col_index in self.selected: + for row_index in range(len(X)): + key = str(X[row_index, col_index]) + X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) + return X + + +class TargetEncoder(Primitive): + cat_to_int_label = {} + + def _fit(self, X, y=None): + for col_index in self.selected: + self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) + for row_index in range(len(X)): + key = str(X[row_index, col_index]) + if key not in self.cat_to_int_label[col_index]: + self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) + return self + + def _transform(self, X, y=None): for col_index in self.selected: for row_index in range(len(X)): key = str(X[row_index, col_index]) @@ -209,20 +270,20 @@ def transform(self, X, y=None): 
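With the split above, the empty-selection guard lives once in Primitive.fit/transform and every concrete step only implements _fit/_transform over its selected columns. A minimal sketch of a new step written against this PATCH-10 interface (the class name, percentile bounds and pipeline entry are illustrative, not part of the patch; Primitive is the base class defined in this file):

import numpy as np

class ClipOutliers(Primitive):
    # Learns per-column 1st/99th percentile bounds on the selected numeric
    # columns at fit time and clips values to that range at transform time.
    lower_ = None
    upper_ = None

    def _fit(self, X, y=None):
        self.lower_ = np.percentile(X[:, self.selected], 1, axis=0)
        self.upper_ = np.percentile(X[:, self.selected], 99, axis=0)
        return self

    def _transform(self, X, y=None):
        X[:, self.selected] = np.clip(X[:, self.selected], self.lower_, self.upper_)
        return X

Because the base class short-circuits when selected_columns is empty, such a step could be appended to the preprocessing Pipeline (e.g. ('clip', ClipOutliers(selected_columns=self.num_col))) without special-casing datasets that have no numeric columns.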
class FilterConstant(Primitive): selected_cols = [] - def fit(self, X, y=None): + def _fit(self, X, y=None): self.selected_cols = np.where(np.max(X, axis=0) - np.min(X, axis=0) != 0)[0] return self - def transform(self, X, y=None): + def _transform(self, X, y=None): return X[:, self.selected_cols] class TimeDiff(Primitive): - def fit(self, X, y=None): + def _fit(self, X, y=None): return self - def transform(self, X, y=None): + def _transform(self, X, y=None): x_time = X[:, self.selected] len_cols = x_time.shape[1] for i in range(len_cols): @@ -231,27 +292,40 @@ def transform(self, X, y=None): return X +class TimeOffset(Primitive): + start_time = None + + def _fit(self, X, y=None): + self.start_time = np.min(X[self.selected], axis=0) + return self + + def _transform(self, X, y=None): + X[self.selected] = X[self.selected] - self.start_time + return X + + class TabPCA(Primitive): pca = None - def fit(self, X, y=None): + def _fit(self, X, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') + self.pca.fit(X[:, self.selected]) return self - def transform(self, X, y=None): - x_pca = self.pca.fit_transform(X[:, self.selected]) + def _transform(self, X, y=None): + x_pca = self.pca.transform(X[:, self.selected]) return np.concatenate([X, x_pca], axis=1) class CatCount(Primitive): count_dict = {} - def fit(self, X, y=None): + def _fit(self, X, y=None): for col in self.selected: self.count_dict[col] = collections.Counter(X[:, col]) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): for col in self.selected: gen_freq = np.vectorize(lambda key: self.count_dict[col][key]) X[:, col] = gen_freq(X[:, col]) @@ -260,10 +334,10 @@ def transform(self, X, y=None): class LogTransform(Primitive): - def fit(self, X, y=None): + def _fit(self, X, y=None): return self - def transform(self, X, y=None): + def _transform(self, X, y=None): for col in self.selected: X[:, col] = np.square(np.log(X[:, col])) return X @@ -272,12 +346,13 @@ def transform(self, X, y=None): class Imputation(Primitive): imputer = None - def fit(self, X, y=None): - self.imputer = SimpleImputer(strategy='most_frequent') + def _fit(self, X, y=None): + # TODO implement most_frequent + self.imputer = SimpleImputer(strategy='constant', fill_value=0) self.imputer.fit(X) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): return self.imputer.transform(X) @@ -287,7 +362,7 @@ def __init__(self, selected_columns=[], selected_type=None, threshold=0.001): self.threshold = threshold self.drop_columns = [] - def fit(self, X, y=None): + def _fit(self, X, y=None): for col in self.selected: mu = abs(pearsonr(X[:, col], y)[0]) if np.isnan(mu): @@ -296,7 +371,7 @@ def fit(self, X, y=None): self.drop_columns.append(col) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): X = np.delete(X, self.drop_columns, axis=1) return X @@ -308,7 +383,7 @@ def __init__(self, selected_columns=[], selected_type=None, threshold=0.001, tas self.drop_columns = [] self.task_type = task_type - def fit(self, X, y=None): + def _fit(self, X, y=None): if self.task_type == 'classification': n_classes = len(set(y)) if n_classes == 2: @@ -333,7 +408,7 @@ def fit(self, X, y=None): self.drop_columns = np.where(feature_importance < self.threshold)[0] return self - def transform(self, X, y=None): + def _transform(self, X, y=None): X = np.delete(X, self.drop_columns, axis=1) return X diff --git a/examples/benchmarking.py b/examples/benchmarking.py index a5e7d69..4ff1a59 100644 --- 
a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -8,8 +8,8 @@ import sklearn.datasets from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, balanced_accuracy_score,\ mean_absolute_error, mean_squared_error -from autosklearn.regression import AutoSklearnRegressor -from autosklearn.classification import AutoSklearnClassifier +# from autosklearn.regression import AutoSklearnRegressor +# from autosklearn.classification import AutoSklearnClassifier from autokaggle import * from autokaggle.utils import * import openml @@ -142,50 +142,50 @@ def evaluate(self, task_id, time_limit=10*60): print(result) return result - -class BenchmarkingAutoSklearn(BenchmarkingBase): - def get_data_info(self, categorical_indicator): - return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] - - def evaluate(self, task_id, time_limit=10*60): - task_info = ["autosklearn", task_id, time_limit] - task = openml.tasks.get_task(task_id) - train_indices, test_indices = task.get_train_test_split_indices() - dataset = task.get_dataset() - X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') - - x_train, y_train = X[train_indices], y[train_indices] - x_test, y_test = X[test_indices], y[test_indices] - - # Create feature type list from openml.org indicator - feat_type = self.get_data_info(categorical_indicator) - - # Train - if task.task_type == 'Supervised Classification': - automl = AutoSklearnClassifier( - time_left_for_this_task=time_limit, - per_run_time_limit=time_limit//10, **kwargs) - elif task.task_type == 'Supervised Regression': - automl = AutoSklearnRegressor( - time_left_for_this_task=time_limit, - per_run_time_limit=time_limit//10, **kwargs) - else: - print("UNSUPPORTED TASK_TYPE") - assert(0) - - automl.fit(x_train, y_train, feat_type=feat_type) - - y_hat = automl.predict(x_test) - if task.task_type == 'Supervised Classification': - is_binary = True if len(task.class_labels) <= 2 else False - result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) - self.cls_results.loc[len(self.cls_results)] = result - elif task.task_type == 'Supervised Regression': - result = task_info + self.measure_performance_rgs(y_test, y_hat) - self.rgs_results.loc[len(self.rgs_results)] = result - self.results.append(result) - print(result) - return result +# +# class BenchmarkingAutoSklearn(BenchmarkingBase): +# def get_data_info(self, categorical_indicator): +# return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] +# +# def evaluate(self, task_id, time_limit=10*60): +# task_info = ["autosklearn", task_id, time_limit] +# task = openml.tasks.get_task(task_id) +# train_indices, test_indices = task.get_train_test_split_indices() +# dataset = task.get_dataset() +# X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') +# +# x_train, y_train = X[train_indices], y[train_indices] +# x_test, y_test = X[test_indices], y[test_indices] +# +# # Create feature type list from openml.org indicator +# feat_type = self.get_data_info(categorical_indicator) +# +# # Train +# if task.task_type == 'Supervised Classification': +# automl = AutoSklearnClassifier( +# time_left_for_this_task=time_limit, +# per_run_time_limit=time_limit//10, **kwargs) +# elif task.task_type == 'Supervised Regression': +# automl = AutoSklearnRegressor( +# time_left_for_this_task=time_limit, +# per_run_time_limit=time_limit//10, **kwargs) +# else: +# 
print("UNSUPPORTED TASK_TYPE") +# assert(0) +# +# automl.fit(x_train, y_train, feat_type=feat_type) +# +# y_hat = automl.predict(x_test) +# if task.task_type == 'Supervised Classification': +# is_binary = True if len(task.class_labels) <= 2 else False +# result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) +# self.cls_results.loc[len(self.cls_results)] = result +# elif task.task_type == 'Supervised Regression': +# result = task_info + self.measure_performance_rgs(y_test, y_hat) +# self.rgs_results.loc[len(self.rgs_results)] = result +# self.results.append(result) +# print(result) +# return result def get_dataset_ids(task_ids): if type(task_ids) == list: @@ -208,6 +208,7 @@ def get_task_info(task_ids): dataset_list.append(dataset) return dataset_list, task_types + def get_dataset_properties(task_ids): dataset_list, task_types = get_task_info(task_ids) df = pd.DataFrame(columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", "#Date"]) @@ -274,3 +275,12 @@ def get_box_plot(data, task_id, metric): auto_kaggle.append(med_ak) temp = pd.DataFrame(data={"Autokaggle":auto_kaggle, "AutoSklearn":auto_sklearn}) temp.boxplot() + + +if __name__ == "__main__": + regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] + classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, + 9914, 3020, 3524, 3573, 3962] + ak = BenchmarkingAutoKaggle() + # ak.run_automation(classification_task_list) + ak.evaluate(24) \ No newline at end of file From d6a606087fba2a702188ec673883751e32b37d2f Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 4 Sep 2019 12:40:44 -0500 Subject: [PATCH 11/31] Before shifting to pandas --- autokaggle/estimators.py | 6 ++++++ autokaggle/preprocessor.py | 36 ++++++++++++++++++------------------ examples/benchmarking.py | 6 +++++- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 3209710..dccff50 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -20,6 +20,7 @@ from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK +from imblearn.over_sampling import SMOTE, SMOTENC # TODO: Way to change the default hparams @@ -117,6 +118,7 @@ def fit(self, x, y): if self.objective == 'classification': n_classes = len(set(y)) self.objective = 'binary' if n_classes == 2 else 'multiclass' + x, y = self.resample(x, y) self.best_estimator_, _ = self.search(x, y) self.best_estimator_.fit(x, y) self.save_model() @@ -125,6 +127,10 @@ def predict(self, x, y=None): y_pred = self.best_estimator_.predict(x, ) return y_pred + @staticmethod + def resample(X, y): + return SMOTE(sampling_strategy='auto').fit_resample(X, y) + @staticmethod def subsample(x, y, sample_percent): # TODO: Add way to balance the subsample diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 84a9460..55943a9 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -247,24 +247,24 @@ def _transform(self, X, y=None): return X -class TargetEncoder(Primitive): - cat_to_int_label = {} - - def _fit(self, X, y=None): - for col_index in self.selected: - self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) - for row_index in range(len(X)): - key = str(X[row_index, col_index]) - if key not in self.cat_to_int_label[col_index]: - self.cat_to_int_label[col_index][key] = 
len(self.cat_to_int_label[col_index]) - return self - - def _transform(self, X, y=None): - for col_index in self.selected: - for row_index in range(len(X)): - key = str(X[row_index, col_index]) - X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) - return X +# class TargetEncoder(Primitive): +# cat_to_int_label = {} +# +# def _fit(self, X, y=None): +# for col_index in self.selected: +# self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) +# for row_index in range(len(X)): +# key = str(X[row_index, col_index]) +# if key not in self.cat_to_int_label[col_index]: +# self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) +# return self +# +# def _transform(self, X, y=None): +# for col_index in self.selected: +# for row_index in range(len(X)): +# key = str(X[row_index, col_index]) +# X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) +# return X class FilterConstant(Primitive): diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 4ff1a59..30d8dec 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -283,4 +283,8 @@ def get_box_plot(data, task_id, metric): 9914, 3020, 3524, 3573, 3962] ak = BenchmarkingAutoKaggle() # ak.run_automation(classification_task_list) - ak.evaluate(24) \ No newline at end of file + import time + t1 = time.time() + ak.evaluate(45) + t2 = time.time() + print(t2-t1) \ No newline at end of file From 8d0dc6bd22a553ccdd6083a6941bbef3336016c2 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 4 Sep 2019 19:07:26 -0500 Subject: [PATCH 12/31] Shift to pandas, add 2nd order and target encoding --- autokaggle/preprocessor.py | 237 ++++++++++++++++++++++++++++--------- examples/benchmarking.py | 4 +- 2 files changed, 184 insertions(+), 57 deletions(-) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 55943a9..bead35b 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -1,6 +1,8 @@ import numpy as np -from pandas import DataFrame +import pandas as pd import scipy +import math +import itertools from scipy.stats import pearsonr from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer @@ -70,6 +72,9 @@ def fit(self, raw_x, y, time_limit, data_info): self.cat_col = list(np.where(self.data_info == 'CAT')[0]) self.num_col = list(np.where(self.data_info == 'NUM')[0]) self.time_col = list(np.where(self.data_info == 'TIME')[0]) + self.cat_col = [str(i) for i in self.cat_col] + self.num_col = [str(i) for i in self.num_col] + self.time_col = [str(i) for i in self.time_col] print('#TIME features: {}'.format(self.n_time)) print('#NUM features: {}'.format(self.n_num)) @@ -78,12 +83,21 @@ def fit(self, raw_x, y, time_limit, data_info): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x + # To pandas + raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) + self.pipeline = Pipeline([ - ('label_encoder', CatEncoder(selected_columns=self.cat_col)), + # ('cat_num_encoder', CatNumEncoder(selected_columns=self.cat_col, selected_num=self.num_col)), + ('cat_encoder', TargetEncoder(selected_columns=self.cat_col)), + # ('cat_cat_encoder', CatCatEncoder(selected_columns=self.cat_col)), ('imputer', Imputation(selected_columns=self.cat_col + self.num_col + self.time_col)), ('scaler', TabScaler(selected_columns=self.num_col)), + ('boxcox', 
BoxCox(selected_columns=self.num_col)), + ('binning', Binning(selected_columns=self.num_col)), + ('log_square', LogTransform(selected_columns=self.num_col)), ('pca', TabPCA(selected_columns=self.num_col)), ('time_diff', TimeDiff(selected_columns=self.time_col)), + ('time_offset', TimeOffset(selected_columns=self.time_col)), ('filter', FilterConstant(selected_columns=self.time_col + self.num_col + self.cat_col)), ('pearson_corr', FeatureFilter(selected_columns=self.time_col + self.num_col + self.cat_col)), ('lgbm_feat_selection', FeatureImportance(selected_columns=self.time_col + self.num_col + self.cat_col)), @@ -114,7 +128,10 @@ def transform(self, raw_x, time_limit=None): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - return self.pipeline.transform(raw_x) + + # To pandas + raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) + return self.pipeline.transform(raw_x).values @staticmethod def extract_data_info(raw_x): @@ -165,6 +182,7 @@ def __init__(self, selected_columns=[], selected_type=None): self.selected_type = selected_type def fit(self, X, y=None): + self.selected = list(set(X.columns) & set(self.selected)) if not self.selected: return self return self._fit(X, y) @@ -188,11 +206,11 @@ class TabScaler(Primitive): def _fit(self, X, y=None): self.scaler = StandardScaler() - self.scaler.fit(X[:, self.selected], y) + self.scaler.fit(X[self.selected], y) return self def _transform(self, X, y=None): - X[:, self.selected] = self.scaler.transform(X[:, self.selected]) + X[self.selected] = self.scaler.transform(X[self.selected]) return X @@ -201,11 +219,11 @@ class BoxCox(Primitive): def _fit(self, X, y=None): self.transformer = PowerTransformer() - self.transformer.fit(X[:, self.selected], y) + self.transformer.fit(X[self.selected], y) return self def _transform(self, X, y=None): - X[:, self.selected] = self.transformer.transform(X[:, self.selected]) + X[self.selected] = self.transformer.transform(X[self.selected]) return X @@ -219,18 +237,19 @@ def __init__(self, selected_columns=[], selected_type=None, strategy='quantile', def _fit(self, X, y=None): self.binner = KBinsDiscretizer(strategy=self.strategy, encode=self.encoding) - self.binner.fit(X[:, self.selected], y) + self.binner.fit(X[self.selected], y) return self def _transform(self, X, y=None): - X[:, self.selected] = self.binner.transform(X[:, self.selected]) + X[self.selected] = self.binner.transform(X[self.selected]) return X class CatEncoder(Primitive): - cat_to_int_label = {} + cat_to_int_label = None def _fit(self, X, y=None): + self.cat_to_int_label = {} for col_index in self.selected: self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) for row_index in range(len(X)): @@ -247,35 +266,141 @@ def _transform(self, X, y=None): return X -# class TargetEncoder(Primitive): -# cat_to_int_label = {} -# -# def _fit(self, X, y=None): -# for col_index in self.selected: -# self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) -# for row_index in range(len(X)): -# key = str(X[row_index, col_index]) -# if key not in self.cat_to_int_label[col_index]: -# self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) -# return self -# -# def _transform(self, X, y=None): -# for col_index in self.selected: -# for row_index in range(len(X)): -# key = str(X[row_index, col_index]) -# X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) -# return X +class 
TargetEncoder(Primitive): + target_encoding_map = None + + @staticmethod + def calc_smooth_mean(df, by, on, alpha=5): + # Compute the global mean + mean = df[on].mean() + + # Compute the number of values and the mean of each group + agg = df.groupby(by)[on].agg(['count', 'mean']) + counts = agg['count'] + means = agg['mean'] + + # Compute the "smoothed" means + smooth = (counts * means + alpha * mean) / (counts + alpha) + return smooth + + def _fit(self, X, y=None): + self.target_encoding_map = {} + X['target'] = y + for col in self.selected: + self.target_encoding_map[col] = self.calc_smooth_mean(X, col, 'target', alpha=5) + X.drop('target', axis=1, inplace=True) + return self + + def _transform(self, X, y=None): + for col in self.selected: + X[col] = X[col].map(self.target_encoding_map[col]) + return X + + +class CatCatEncoder(Primitive): + def __init__(self, selected_columns=[], selected_type=None, strategy='count'): + super().__init__(selected_columns, selected_type) + self.strategy = strategy + self.cat_cat_map = {} + + @staticmethod + def cat_cat_count(df, col1, col2, strategy='count'): + if strategy == 'count': + mapping = df.groupby([col1])[col2].count() + elif strategy == 'nunique': + mapping = df.groupby([col1])[col2].nunique() + else: + mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[col2].nunique() + return mapping + + def _fit(self, X, y=None): + for col1, col2 in itertools.combinations(self.selected, 2): + self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(X, col1, col2, self.strategy) + return self + + def _transform(self, X, y=None): + for col1, col2 in itertools.combinations(self.selected, 2): + if col1 + '_cross_' + col2 in self.cat_cat_map: + X[col1 + '_cross_' + col2] = X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) + return X + + +class CatNumEncoder(Primitive): + def __init__(self, selected_columns=[], selected_type=None, selected_num=[], strategy='mean'): + super().__init__(selected_columns, selected_type) + self.selected_num = selected_num + self.strategy = strategy + self.cat_num_map = {} + + @staticmethod + def cat_num_interaction(df, col1, col2, method='mean'): + if method == 'mean': + mapping = df.groupby([col1])[col2].mean() + elif method == 'std': + mapping = df.groupby([col1])[col2].std() + elif method == 'max': + mapping = df.groupby([col1])[col2].max() + elif method == 'min': + mapping = df.groupby([col1])[col2].min() + else: + mapping = df.groupby([col1])[col2].mean() + + return mapping + + def _fit(self, X, y=None): + for col1 in self.selected: + for col2 in self.selected_num: + self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(X, col1, col2, self.strategy) + return self + + def _transform(self, X, y=None): + for col1 in self.selected: + for col2 in self.selected_num: + if col1 + '_cross_' + col2 in self.cat_num_map: + X[col1 + '_cross_' + col2] = X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) + return X + + +class CatBinEncoder(Primitive): + def __init__(self, selected_columns=[], selected_type=None, selected_bin=[], strategy='percent_true'): + super().__init__(selected_columns, selected_type) + self.selected_bin = selected_bin + self.strategy = strategy + self.cat_bin_map = {} + + @staticmethod + def cat_bin_interaction(df, col1, col2, strategy='percent_true'): + if strategy == 'percent_true': + mapping = df.groupby([col1])[col2].mean() + elif strategy == 'count': + mapping = df.groupby([col1])[col2].count() + else: + mapping = df.groupby([col1])[col2].mean() + return mapping + + def 
_fit(self, X, y=None): + for col1 in self.selected: + for col2 in self.selected_bin: + self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(X, col1, col2, self.strategy) + return self + + def _transform(self, X, y=None): + for col1 in self.selected: + for col2 in self.selected_bin: + if col1 + '_cross_' + col2 in self.cat_bin_map: + X[col1 + '_cross_' + col2] = X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) + return X class FilterConstant(Primitive): - selected_cols = [] + selected_cols = None def _fit(self, X, y=None): - self.selected_cols = np.where(np.max(X, axis=0) - np.min(X, axis=0) != 0)[0] + self.selected_cols = X.columns[(X.max(axis=0) - X.min(axis=0) != 0)].tolist() return self def _transform(self, X, y=None): - return X[:, self.selected_cols] + return X[self.selected_cols] class TimeDiff(Primitive): @@ -284,11 +409,8 @@ def _fit(self, X, y=None): return self def _transform(self, X, y=None): - x_time = X[:, self.selected] - len_cols = x_time.shape[1] - for i in range(len_cols): - for j in range(i + 1, len_cols): - X = np.append(X, np.expand_dims(x_time[:, i] - x_time[:, j], 1), 1) + for a, b in itertools.combinations(self.selected, 2): + X[a + '-' + b] = X[a] - X[b] return X @@ -296,7 +418,7 @@ class TimeOffset(Primitive): start_time = None def _fit(self, X, y=None): - self.start_time = np.min(X[self.selected], axis=0) + self.start_time = X[self.selected].min(axis=0) return self def _transform(self, X, y=None): @@ -309,26 +431,27 @@ class TabPCA(Primitive): def _fit(self, X, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') - self.pca.fit(X[:, self.selected]) + self.pca.fit(X[self.selected]) return self def _transform(self, X, y=None): - x_pca = self.pca.transform(X[:, self.selected]) - return np.concatenate([X, x_pca], axis=1) + x_pca = self.pca.transform(X[self.selected]) + x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in range(x_pca.shape[1])]) + return pd.concat([X, x_pca], axis=1) class CatCount(Primitive): - count_dict = {} + count_dict = None def _fit(self, X, y=None): + self.count_dict = {} for col in self.selected: - self.count_dict[col] = collections.Counter(X[:, col]) + self.count_dict[col] = collections.Counter(X[col]) return self def _transform(self, X, y=None): for col in self.selected: - gen_freq = np.vectorize(lambda key: self.count_dict[col][key]) - X[:, col] = gen_freq(X[:, col]) + X[col] = X[col].apply(lambda key: self.count_dict[col][key]) return X @@ -339,21 +462,24 @@ def _fit(self, X, y=None): def _transform(self, X, y=None): for col in self.selected: - X[:, col] = np.square(np.log(X[:, col])) + X[col] = np.square(np.log(1 + X[col])) return X class Imputation(Primitive): - imputer = None + impute_dict = None def _fit(self, X, y=None): - # TODO implement most_frequent - self.imputer = SimpleImputer(strategy='constant', fill_value=0) - self.imputer.fit(X) + self.impute_dict = {} + for col in self.selected: + value_counts = X[col].value_counts() + self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 return self def _transform(self, X, y=None): - return self.imputer.transform(X) + for col in self.selected: + X[col] = X[col].fillna(self.impute_dict[col]) + return X class FeatureFilter(Primitive): @@ -364,7 +490,7 @@ def __init__(self, selected_columns=[], selected_type=None, threshold=0.001): def _fit(self, X, y=None): for col in self.selected: - mu = abs(pearsonr(X[:, col], y)[0]) + mu = abs(pearsonr(X[col], y)[0]) if np.isnan(mu): mu = 0 if mu < self.threshold: @@ -372,7 +498,7 @@ def 
_fit(self, X, y=None): return self def _transform(self, X, y=None): - X = np.delete(X, self.drop_columns, axis=1) + X.drop(columns=self.drop_columns, inplace=True) return X @@ -397,7 +523,8 @@ def _fit(self, X, y=None): n_jobs=1, num_class=n_classes, objective='multiclass') - elif self.task_type == 'regression': + else: + # self.task_type == 'regression' estimator = LGBMRegressor(silent=False, verbose=-1, n_jobs=1, @@ -405,11 +532,11 @@ def _fit(self, X, y=None): estimator.fit(X, y) feature_importance = estimator.feature_importances_ feature_importance = feature_importance/feature_importance.mean() - self.drop_columns = np.where(feature_importance < self.threshold)[0] + self.drop_columns = X.columns[np.where(feature_importance < self.threshold)[0]] return self def _transform(self, X, y=None): - X = np.delete(X, self.drop_columns, axis=1) + X.drop(columns=self.drop_columns, inplace=True) return X diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 30d8dec..317bef4 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -285,6 +285,6 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) import time t1 = time.time() - ak.evaluate(45) + ak.evaluate(3020) t2 = time.time() - print(t2-t1) \ No newline at end of file + print(t2-t1) From e10541ddcb2fcbd0b8a9d061a3e9ba7b78d1e851 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 4 Sep 2019 19:35:37 -0500 Subject: [PATCH 13/31] Use Tabular Data --- autokaggle/preprocessor.py | 93 ++++++++++++++++---------------------- examples/benchmarking.py | 2 +- 2 files changed, 40 insertions(+), 55 deletions(-) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index bead35b..f0c655b 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import scipy -import math import itertools from scipy.stats import pearsonr from sklearn.decomposition import PCA @@ -17,10 +16,11 @@ class TabularPreprocessor: - def __init__(self): + def __init__(self, verbose=True): """ Initialization function for tabular preprocessor. 
""" + self.verbose = verbose self.num_cat_pair = {} self.total_samples = 0 @@ -61,30 +61,13 @@ def fit(self, raw_x, y, time_limit, data_info): self.budget = time_limit # Extract or read data info self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - print('DATA_INFO: {}'.format(self.data_info)) # Set the meta info for each data type - self.n_time = sum(self.data_info == 'TIME') - self.n_num = sum(self.data_info == 'NUM') - self.n_cat = sum(self.data_info == 'CAT') - self.total_samples = raw_x.shape[0] - - self.cat_col = list(np.where(self.data_info == 'CAT')[0]) - self.num_col = list(np.where(self.data_info == 'NUM')[0]) - self.time_col = list(np.where(self.data_info == 'TIME')[0]) - self.cat_col = [str(i) for i in self.cat_col] - self.num_col = [str(i) for i in self.num_col] - self.time_col = [str(i) for i in self.time_col] - - print('#TIME features: {}'.format(self.n_time)) - print('#NUM features: {}'.format(self.n_num)) - print('#CAT features: {}'.format(self.n_cat)) - - # Convert sparse to dense if needed - raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x + self.cat_col = [str(i) for i in np.where(self.data_info == 'CAT')[0]] + self.num_col = [str(i) for i in np.where(self.data_info == 'NUM')[0]] + self.time_col = [str(i) for i in np.where(self.data_info == 'TIME')[0]] - # To pandas - raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) + data = TabularData(raw_x, self.data_info, self.verbose) self.pipeline = Pipeline([ # ('cat_num_encoder', CatNumEncoder(selected_columns=self.cat_col, selected_num=self.num_col)), @@ -102,7 +85,7 @@ def fit(self, raw_x, y, time_limit, data_info): ('pearson_corr', FeatureFilter(selected_columns=self.time_col + self.num_col + self.cat_col)), ('lgbm_feat_selection', FeatureImportance(selected_columns=self.time_col + self.num_col + self.cat_col)), ]) - self.pipeline.fit(raw_x, y) + self.pipeline.fit(data.X, y) return self @@ -126,12 +109,8 @@ def transform(self, raw_x, time_limit=None): else: self.budget = time_limit - # Convert sparse to dense if needed - raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - - # To pandas - raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) - return self.pipeline.transform(raw_x).values + data = TabularData(raw_x, self.data_info, self.verbose) + return self.pipeline.transform(data.X).values @staticmethod def extract_data_info(raw_x): @@ -153,27 +132,33 @@ def extract_data_info(raw_x): class TabularData: - def __init__(self, data, data_info): - self.data = data + def __init__(self, raw_x, data_info, verbose=True): self.data_info = data_info + self.verbose = verbose + self.cat_col = [str(i) for i in np.where(self.data_info == 'CAT')[0]] + self.num_col = [str(i) for i in np.where(self.data_info == 'NUM')[0]] + self.time_col = [str(i) for i in np.where(self.data_info == 'TIME')[0]] + + self.n_time = len(self.time_col) + self.n_num = len(self.num_col) + self.n_cat = len(self.cat_col) + self.total_samples = raw_x.shape[0] - self.total_samples = 0 - - self.cat_to_int_label = {} - self.n_first_batch_keys = {} - self.high_level_cat_keys = [] + # Convert sparse to dense if needed + raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - self.num_cat_pair = {} - self.feature_add_high_cat = 0 - self.feature_add_cat_num = 10 - self.feature_add_cat_cat = 10 - self.order_num_cat_pair = {} + # To pandas + if type(raw_x) != pd.DataFrame: + raw_x = pd.DataFrame(raw_x, 
columns=[str(i) for i in range(raw_x.shape[1])]) - self.selected_cols = None + self.X = raw_x + self.cat_cardinality = {} - self.n_time = None - self.n_num = None - self.n_cat = None + if self.verbose: + print('DATA_INFO: {}'.format(self.data_info)) + print('#TIME features: {}'.format(self.n_time)) + print('#NUM features: {}'.format(self.n_num)) + print('#CAT features: {}'.format(self.n_cat)) class Primitive(BaseEstimator, TransformerMixin): @@ -181,23 +166,23 @@ def __init__(self, selected_columns=[], selected_type=None): self.selected = selected_columns self.selected_type = selected_type - def fit(self, X, y=None): - self.selected = list(set(X.columns) & set(self.selected)) + def fit(self, data, y=None): + self.selected = list(set(data.columns) & set(self.selected)) if not self.selected: return self - return self._fit(X, y) + return self._fit(data, y) - def transform(self, X, y=None): + def transform(self, data, y=None): if not self.selected: - return X - return self._transform(X, y) + return data + return self._transform(data, y) @abstractmethod - def _fit(self, X, y=None): + def _fit(self, data, y=None): pass @abstractmethod - def _transform(self, X, y=None): + def _transform(self, data, y=None): pass diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 317bef4..50bcfbb 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -285,6 +285,6 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) import time t1 = time.time() - ak.evaluate(3020) + ak.evaluate(3021) t2 = time.time() print(t2-t1) From 2e97035a4d037ed723c1502a337be84a3596d636 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 5 Sep 2019 10:18:34 -0500 Subject: [PATCH 14/31] Save changes --- autokaggle/ensemblers.py | 56 +++--- autokaggle/preprocessor.py | 371 ++++++++++++++++++++++--------------- 2 files changed, 257 insertions(+), 170 deletions(-) diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index e931954..1e5fe88 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -14,6 +14,8 @@ from joblib import dump, load from scipy import stats from lightgbm import LGBMClassifier, LGBMRegressor +import collections +from sklearn.model_selection import RandomizedSearchCV, cross_val_score class RankedEnsembler: @@ -73,52 +75,59 @@ def predict(self, X): class EnsembleSelection: - def __init__(self, ensemble_size=25): - self.ensemble_size = ensemble_size - - def fit(self, predictions, y_true): + indices_ = None + weights_ = None + + def __init__(self, estimator_list, objective, ensemble_size=25): + self.estimator_list = estimator_list + self.objective = objective + self.indices_, self.weights_ = [], [] + self.ensemble_size = min(len(estimator_list), ensemble_size) + if self.objective == 'regression': + self.score_metric = 'neg_mean_squared_error' + self.skf = KFold(n_splits=3, shuffle=True, random_state=1001) + else: + self.score_metric = 'neg_mean_squared_error' + self.skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1001) + + def fit(self, X, y_true): """Rich Caruana's ensemble selection method. (fast version)""" ensemble = [] trajectory = [] order = [] for i in range(self.ensemble_size): - scores = np.zeros((len(predictions))) + scores = np.zeros((len(self.estimator_list))) s = len(ensemble) if s == 0: - weighted_ensemble_prediction = np.zeros(predictions[0].shape) + weighted_ensemble_prediction = np.zeros(len(self.estimator_list)) else: # Memory-efficient averaging! 
ensemble_prediction = np.zeros(ensemble[0].shape) - for pred in ensemble: - ensemble_prediction += pred + for est in ensemble: + ensemble_prediction += est ensemble_prediction /= s - weighted_ensemble_prediction = (s / float(s + 1)) * \ - ensemble_prediction + weighted_ensemble_prediction = (s / float(s + 1)) * ensemble_prediction fant_ensemble_prediction = np.zeros(weighted_ensemble_prediction.shape) - for j, pred in enumerate(predictions): + for j, est in enumerate(self.estimator_list): fant_ensemble_prediction[:,:] = weighted_ensemble_prediction + \ - (1. / float(s + 1)) * pred - scores[j] = calculate_score( - solution=labels, - prediction=fant_ensemble_prediction, - task_type=self.task_type, - metric=self.metric, - all_scoring_functions=False) + (1. / float(s + 1)) * est + scores[j] = cross_val_score(self.estimator_list[j], X, y_true, scoring=self.score_metric, + cv=self.skf).mean() all_best = np.argwhere(scores == np.nanmin(scores)).flatten() best = np.random.RandomState.choice(all_best) - ensemble.append(predictions[best]) + ensemble.append(self.estimator_list[best]) trajectory.append(scores[best]) order.append(best) # Handle special case - if len(predictions) == 1: + if len(self.estimator_list) == 1: break self.indices_ = order - ensemble_members = Counter(self.indices_).most_common() + ensemble_members = collections.Counter(self.indices_) weights = np.zeros((self.ensemble_size,), dtype=float) for ensemble_member in ensemble_members: weight = float(ensemble_member[1]) / self.ensemble_size @@ -128,7 +137,6 @@ def fit(self, predictions, y_true): weights = weights / np.sum(weights) self.weights_ = weights - self._calculate_weights() - def predict(self, predictions): - return np.average(predictions, axis=1, weights=self.weights_) + def predict(self, X): + return np.average(X, axis=1, weights=self.weights_) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index f0c655b..d2ac6d8 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -21,11 +21,8 @@ def __init__(self, verbose=True): Initialization function for tabular preprocessor. 
""" self.verbose = verbose - self.num_cat_pair = {} self.total_samples = 0 - - self.cat_to_int_label = {} self.n_first_batch_keys = {} self.high_level_cat_keys = [] @@ -34,15 +31,8 @@ def __init__(self, verbose=True): self.feature_add_cat_cat = 10 self.order_num_cat_pair = {} - self.selected_cols = None self.budget = None self.data_info = None - self.n_time = None - self.n_num = None - self.n_cat = None - self.cat_col = None - self.num_col = None - self.time_col = None self.pipeline = None def fit(self, raw_x, y, time_limit, data_info): @@ -62,30 +52,27 @@ def fit(self, raw_x, y, time_limit, data_info): # Extract or read data info self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - # Set the meta info for each data type - self.cat_col = [str(i) for i in np.where(self.data_info == 'CAT')[0]] - self.num_col = [str(i) for i in np.where(self.data_info == 'NUM')[0]] - self.time_col = [str(i) for i in np.where(self.data_info == 'TIME')[0]] - data = TabularData(raw_x, self.data_info, self.verbose) self.pipeline = Pipeline([ - # ('cat_num_encoder', CatNumEncoder(selected_columns=self.cat_col, selected_num=self.num_col)), - ('cat_encoder', TargetEncoder(selected_columns=self.cat_col)), - # ('cat_cat_encoder', CatCatEncoder(selected_columns=self.cat_col)), - ('imputer', Imputation(selected_columns=self.cat_col + self.num_col + self.time_col)), - ('scaler', TabScaler(selected_columns=self.num_col)), - ('boxcox', BoxCox(selected_columns=self.num_col)), - ('binning', Binning(selected_columns=self.num_col)), - ('log_square', LogTransform(selected_columns=self.num_col)), - ('pca', TabPCA(selected_columns=self.num_col)), - ('time_diff', TimeDiff(selected_columns=self.time_col)), - ('time_offset', TimeOffset(selected_columns=self.time_col)), - ('filter', FilterConstant(selected_columns=self.time_col + self.num_col + self.cat_col)), - ('pearson_corr', FeatureFilter(selected_columns=self.time_col + self.num_col + self.cat_col)), - ('lgbm_feat_selection', FeatureImportance(selected_columns=self.time_col + self.num_col + self.cat_col)), + ('imputer', Imputation(selected_type='ALL', operation='upd')), + ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), + ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), + ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), + ('count_encoder', CatCount(selected_type='CAT', operation='add')), + ('label_encoder', CatEncoder(selected_type='CAT', operation='add')), + ('boxcox', BoxCox(selected_type='NUM', operation='upd')), + ('log_square', LogTransform(selected_type='NUM', operation='upd')), + ('scaler', TabScaler(selected_type='NUM', operation='upd')), + ('binning', Binning(selected_type='NUM', operation='add')), + ('pca', TabPCA(selected_type='NUM', operation='add')), + ('time_diff', TimeDiff(selected_type='TIME', operation='upd')), + ('time_offset', TimeOffset(selected_type='TIME', operation='upd')), + ('filter', FilterConstant(selected_type='ALL', operation='del')), + ('pearson_corr', FeatureFilter(selected_type='ALL', operation='del')), + ('lgbm_feat_selection', FeatureImportance(selected_type='ALL', operation='del')), ]) - self.pipeline.fit(data.X, y) + self.pipeline.fit(data, y) return self @@ -110,7 +97,7 @@ def transform(self, raw_x, time_limit=None): self.budget = time_limit data = TabularData(raw_x, self.data_info, self.verbose) - return self.pipeline.transform(data.X).values + return self.pipeline.transform(data).X.values @staticmethod def 
extract_data_info(raw_x): @@ -132,27 +119,31 @@ def extract_data_info(raw_x): class TabularData: + cat_col = None + num_col = None + time_col = None + n_cat, n_time, n_num = 0, 0, 0 + cat_cardinality = None + generated_features = None + feature_options = None + num_info = None + def __init__(self, raw_x, data_info, verbose=True): - self.data_info = data_info self.verbose = verbose - self.cat_col = [str(i) for i in np.where(self.data_info == 'CAT')[0]] - self.num_col = [str(i) for i in np.where(self.data_info == 'NUM')[0]] - self.time_col = [str(i) for i in np.where(self.data_info == 'TIME')[0]] - - self.n_time = len(self.time_col) - self.n_num = len(self.num_col) - self.n_cat = len(self.cat_col) + self.data_info = {str(i): data_info[i] for i in range(len(data_info))} self.total_samples = raw_x.shape[0] + self.refresh_col_types() # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - # To pandas + # To pandas Dataframe if type(raw_x) != pd.DataFrame: raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) self.X = raw_x self.cat_cardinality = {} + self.update_cat_cardinality() if self.verbose: print('DATA_INFO: {}'.format(self.data_info)) @@ -160,14 +151,70 @@ def __init__(self, raw_x, data_info, verbose=True): print('#NUM features: {}'.format(self.n_num)) print('#CAT features: {}'.format(self.n_cat)) + def update_type(self, columns, new_type): + if not new_type: + return + for c in columns: + self.data_info[c] = new_type + + def delete_type(self, columns): + for c in columns: + _ = self.data_info.pop(c, 0) + + def update(self, operation, columns, x_tr, new_type=None): + if operation == 'upd': + if x_tr is not None: + self.X[columns] = x_tr + self.update_type(columns, new_type) + elif operation == 'add': + if x_tr is not None: + self.X = pd.concat([self.X, x_tr], axis=1) + self.update_type(x_tr.columns, new_type) + elif operation == 'del': + if len(columns) != 0: + self.X.drop(columns, inplace=True) + self.delete_type(columns) + else: + print("invalid operation") + self.refresh_col_types() + + def refresh_col_types(self): + self.cat_col = [k for k, v in self.data_info.items() if v == 'CAT'] + self.num_col = [k for k, v in self.data_info.items() if v == 'NUM'] + self.time_col = [k for k, v in self.data_info.items() if v == 'TIME'] + self.n_time = len(self.time_col) + self.n_num = len(self.num_col) + self.n_cat = len(self.cat_col) + + def update_cat_cardinality(self): + for c in self.cat_col: + self.cat_cardinality[c] = len(set(self.X[c])) + + def select_columns(self, data_type): + self.refresh_col_types() + if data_type == 'CAT': + return self.cat_col + elif data_type == 'TIME': + return self.time_col + elif data_type == 'NUM': + return self.num_col + elif data_type == 'ALL': + return list(self.data_info.keys()) + else: + print('invalid Type') + return [] + class Primitive(BaseEstimator, TransformerMixin): - def __init__(self, selected_columns=[], selected_type=None): - self.selected = selected_columns + selected = None + drop_columns = None + + def __init__(self, selected_type=None, operation='upd', **kwargs): self.selected_type = selected_type + self.operation = operation def fit(self, data, y=None): - self.selected = list(set(data.columns) & set(self.selected)) + self.selected = data.select_columns(self.selected_type) if not self.selected: return self return self._fit(data, y) @@ -186,54 +233,67 @@ def _transform(self, data, y=None): pass +class PrimitiveHigherOrder(Primitive): + def __init__(self, 
operation='upd', selected_type1=None, selected_type2=None, **kwargs): + self.selected_type1 = selected_type1 + self.selected_type2 = selected_type2 + self.operation = operation + self.options = kwargs + + class TabScaler(Primitive): scaler = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.scaler = StandardScaler() - self.scaler.fit(X[self.selected], y) + self.scaler.fit(data.X[self.selected], y) return self - def _transform(self, X, y=None): - X[self.selected] = self.scaler.transform(X[self.selected]) - return X + def _transform(self, data, y=None): + x_tr = self.scaler.transform(data.X[self.selected]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class BoxCox(Primitive): transformer = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.transformer = PowerTransformer() - self.transformer.fit(X[self.selected], y) + self.transformer.fit(data.X[self.selected], y) return self - def _transform(self, X, y=None): - X[self.selected] = self.transformer.transform(X[self.selected]) - return X + def _transform(self, data, y=None): + x_tr = self.transformer.transform(data.X[self.selected]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class Binning(Primitive): binner = None - def __init__(self, selected_columns=[], selected_type=None, strategy='quantile', encoding='ordinal'): - super().__init__(selected_columns, selected_type) + def __init__(self, selected_type=None, operation='upd', strategy='quantile', encoding='ordinal'): + super().__init__(selected_type, operation) self.strategy = strategy self.encoding = encoding - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.binner = KBinsDiscretizer(strategy=self.strategy, encode=self.encoding) - self.binner.fit(X[self.selected], y) + self.binner.fit(data.X[self.selected], y) return self - def _transform(self, X, y=None): - X[self.selected] = self.binner.transform(X[self.selected]) - return X + def _transform(self, data, y=None): + x_tr = self.binner.transform(data.X[self.selected]) + # TODO: decide if cat or num new type + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class CatEncoder(Primitive): cat_to_int_label = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): + X = data.X self.cat_to_int_label = {} for col_index in self.selected: self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) @@ -243,12 +303,13 @@ def _fit(self, X, y=None): self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + X = data.X for col_index in self.selected: for row_index in range(len(X)): key = str(X[row_index, col_index]) X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) - return X + return data class TargetEncoder(Primitive): @@ -268,7 +329,8 @@ def calc_smooth_mean(df, by, on, alpha=5): smooth = (counts * means + alpha * mean) / (counts + alpha) return smooth - def _fit(self, X, y=None): + def _fit(self, data, y=None): + X = data.X self.target_encoding_map = {} X['target'] = y for col in self.selected: @@ -276,18 +338,15 @@ def _fit(self, X, y=None): X.drop('target', axis=1, inplace=True) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col in self.selected: - X[col] = X[col].map(self.target_encoding_map[col]) - return X - + x_tr[col] = data.X[col].map(self.target_encoding_map[col]) + 
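# Standalone sketch (toy data, column names invented): the smoothed per-category target
# mean computed by calc_smooth_mean above, written out with plain pandas.
import pandas as pd

toy = pd.DataFrame({'city': ['a', 'a', 'b', 'b', 'b', 'c'],
                    'target': [1, 0, 1, 1, 0, 1]})
alpha = 5                                   # smoothing strength
global_mean = toy['target'].mean()
agg = toy.groupby('city')['target'].agg(['count', 'mean'])
# Rare categories are pulled toward the global mean; frequent ones keep their own mean.
smooth = (agg['count'] * agg['mean'] + alpha * global_mean) / (agg['count'] + alpha)
toy['city_encoded'] = toy['city'].map(smooth)
print(toy)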
data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data -class CatCatEncoder(Primitive): - def __init__(self, selected_columns=[], selected_type=None, strategy='count'): - super().__init__(selected_columns, selected_type) - self.strategy = strategy - self.cat_cat_map = {} +class CatCatEncoder(PrimitiveHigherOrder): @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): if strategy == 'count': @@ -298,21 +357,26 @@ def cat_cat_count(df, col1, col2, strategy='count'): mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[col2].nunique() return mapping - def _fit(self, X, y=None): + def _fit(self, data, y=None): + self.cat_cat_map = {} + self.strategy = self.options.get('strategy', 'count') for col1, col2 in itertools.combinations(self.selected, 2): - self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(X, col1, col2, self.strategy) + self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(data.X, col1, col2, self.strategy) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col1, col2 in itertools.combinations(self.selected, 2): if col1 + '_cross_' + col2 in self.cat_cat_map: - X[col1 + '_cross_' + col2] = X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) - return X + x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) + # TODO: decide new_type + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data -class CatNumEncoder(Primitive): - def __init__(self, selected_columns=[], selected_type=None, selected_num=[], strategy='mean'): - super().__init__(selected_columns, selected_type) +class CatNumEncoder(PrimitiveHigherOrder): + def __init__(self, selected_type=None, selected_num=[], operation='add', strategy='mean'): + super().__init__(selected_type, operation) self.selected_num = selected_num self.strategy = strategy self.cat_num_map = {} @@ -332,27 +396,25 @@ def cat_num_interaction(df, col1, col2, method='mean'): return mapping - def _fit(self, X, y=None): + def _fit(self, data, y=None): + self.cat_num_map = {} + self.strategy = self.options.get('strategy', 'mean') for col1 in self.selected: for col2 in self.selected_num: - self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(X, col1, col2, self.strategy) + self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(data.X, col1, col2, self.strategy) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col1 in self.selected: for col2 in self.selected_num: if col1 + '_cross_' + col2 in self.cat_num_map: - X[col1 + '_cross_' + col2] = X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) - return X - + x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data -class CatBinEncoder(Primitive): - def __init__(self, selected_columns=[], selected_type=None, selected_bin=[], strategy='percent_true'): - super().__init__(selected_columns, selected_type) - self.selected_bin = selected_bin - self.strategy = strategy - self.cat_bin_map = {} +class CatBinEncoder(PrimitiveHigherOrder): @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): if strategy == 'percent_true': @@ -363,138 +425,155 @@ def cat_bin_interaction(df, col1, col2, strategy='percent_true'): mapping = df.groupby([col1])[col2].mean() return mapping - def _fit(self, X, y=None): + 
def _fit(self, data, y=None): + self.cat_bin_map = {} + self.strategy = self.options.get('strategy', 'percent_true') for col1 in self.selected: for col2 in self.selected_bin: - self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(X, col1, col2, self.strategy) + self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(data.X, col1, col2, self.strategy) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col1 in self.selected: for col2 in self.selected_bin: if col1 + '_cross_' + col2 in self.cat_bin_map: - X[col1 + '_cross_' + col2] = X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) - return X + x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class FilterConstant(Primitive): - selected_cols = None + drop_columns = None - def _fit(self, X, y=None): - self.selected_cols = X.columns[(X.max(axis=0) - X.min(axis=0) != 0)].tolist() + def _fit(self, data, y=None): + X = data.X + self.drop_columns = X.columns[(X.max(axis=0) - X.min(axis=0) == 0)].tolist() return self - def _transform(self, X, y=None): - return X[self.selected_cols] + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None) + return data class TimeDiff(Primitive): - def _fit(self, X, y=None): + def _fit(self, data, y=None): return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for a, b in itertools.combinations(self.selected, 2): - X[a + '-' + b] = X[a] - X[b] - return X + x_tr[a + '-' + b] = data.X[a] - data.X[b] + data.update(self.operation, self.selected, x_tr, new_type='TIME') + return data class TimeOffset(Primitive): start_time = None - def _fit(self, X, y=None): - self.start_time = X[self.selected].min(axis=0) + def _fit(self, data, y=None): + self.start_time = data.X[self.selected].min(axis=0) return self - def _transform(self, X, y=None): - X[self.selected] = X[self.selected] - self.start_time - return X + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + x_tr[self.selected] = data.X[self.selected] - self.start_time + data.update(self.operation, self.selected, x_tr, new_type='TIME') + return data class TabPCA(Primitive): pca = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') - self.pca.fit(X[self.selected]) + self.pca.fit(data.X[self.selected]) return self - def _transform(self, X, y=None): - x_pca = self.pca.transform(X[self.selected]) + def _transform(self, data, y=None): + x_pca = self.pca.transform(data.X[self.selected]) x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in range(x_pca.shape[1])]) - return pd.concat([X, x_pca], axis=1) + data.update(self.operation, self.selected, x_pca, new_type='NUM') + return data class CatCount(Primitive): count_dict = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.count_dict = {} for col in self.selected: - self.count_dict[col] = collections.Counter(X[col]) + self.count_dict[col] = collections.Counter(data.X[col]) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col in self.selected: - X[col] = X[col].apply(lambda key: self.count_dict[col][key]) - return X + x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col][key]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') 
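# Standalone sketch (toy data, values invented): the frequency encoding applied by
# CatCount above; each category is replaced by how often it was seen during fit.
import collections
import pandas as pd

train_col = pd.Series(['red', 'red', 'blue', 'green', 'red'])
test_col = pd.Series(['blue', 'yellow'])            # 'yellow' was never seen in training
counts = collections.Counter(train_col)
print(test_col.apply(lambda key: counts.get(key, 0)).tolist())   # -> [1, 0]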
+ return data class LogTransform(Primitive): + name_key = 'log_' - def _fit(self, X, y=None): + def _fit(self, data, y=None): return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col in self.selected: - X[col] = np.square(np.log(1 + X[col])) - return X + x_tr[self.name_key + col] = np.square(np.log(1 + data.X[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class Imputation(Primitive): impute_dict = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.impute_dict = {} for col in self.selected: - value_counts = X[col].value_counts() + value_counts = data.X[col].value_counts() self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): for col in self.selected: - X[col] = X[col].fillna(self.impute_dict[col]) - return X + data.X[col].fillna(self.impute_dict[col]) + data.update(self.operation, self.selected, None, new_type='NUM') + return data class FeatureFilter(Primitive): - def __init__(self, selected_columns=[], selected_type=None, threshold=0.001): - super().__init__(selected_columns, selected_type) + def __init__(self, selected_type=None, operation='del', threshold=0.001): + super().__init__(selected_type, operation) self.threshold = threshold self.drop_columns = [] - def _fit(self, X, y=None): + def _fit(self, data, y=None): for col in self.selected: - mu = abs(pearsonr(X[col], y)[0]) + mu = abs(pearsonr(data.X[col], y)[0]) if np.isnan(mu): mu = 0 if mu < self.threshold: self.drop_columns.append(col) return self - def _transform(self, X, y=None): - X.drop(columns=self.drop_columns, inplace=True) - return X + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None) + return data class FeatureImportance(Primitive): - def __init__(self, selected_columns=[], selected_type=None, threshold=0.001, task_type='classification'): - super().__init__(selected_columns, selected_type) + def __init__(self, selected_type=None, operation='del', threshold=0.001, task_type='classification'): + super().__init__(selected_type, operation) self.threshold = threshold self.drop_columns = [] self.task_type = task_type - def _fit(self, X, y=None): + def _fit(self, data, y=None): if self.task_type == 'classification': n_classes = len(set(y)) if n_classes == 2: @@ -514,15 +593,15 @@ def _fit(self, X, y=None): verbose=-1, n_jobs=1, objective='regression') - estimator.fit(X, y) + estimator.fit(data.X, y) feature_importance = estimator.feature_importances_ feature_importance = feature_importance/feature_importance.mean() - self.drop_columns = X.columns[np.where(feature_importance < self.threshold)[0]] + self.drop_columns = data.X.columns[np.where(feature_importance < self.threshold)[0]] return self - def _transform(self, X, y=None): - X.drop(columns=self.drop_columns, inplace=True) - return X + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None) + return data if __name__ == "__main__": From 4c7c0fb3bf977f31cba8985cada332f72e0586cc Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Sun, 15 Sep 2019 15:48:09 -0500 Subject: [PATCH 15/31] Fix the prep pipeline --- autokaggle/auto_ml.py | 7 ++- autokaggle/estimators.py | 113 ++++++++++++++++++++--------------- autokaggle/preprocessor.py | 119 ++++++++++++++++++++++++------------- examples/benchmarking.py | 2 +- 4 files changed, 150 insertions(+), 91 
deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 7cfcbff..97e0ea3 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -67,13 +67,16 @@ def fit(self, x, y, time_limit=None, data_info=None): print("The whole available data is: ") print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) - def predict(self, x_test): + def predict(self, x_test, predict_proba=False): """ This function should provide predictions of labels on (test) data. The function predict eventually can return probabilities or continuous values. """ x_test = self.preprocessor.transform(x_test) - y = self.model.predict(x_test, ) + if predict_proba: + y = self.model.predict_proba(x_test, ) + else: + y = self.model.predict(x_test, ) if y is None: raise ValueError("Tabular predictor does not exist") return y diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index dccff50..6d34553 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -24,42 +24,46 @@ # TODO: Way to change the default hparams -knn_classifier_params = {'n_neighbors': hp.choice('n_neighbors', range(2, 20)), - 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), - 'leaf_size': hp.choice('leaf_size', range(5, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", - "chebyshev", "minkowski" - ]), - 'p': hp.choice('p', range(1, 4)), - } - -svc_params = {'C': hp.lognormal('C', 0, 1), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 6)), - 'gamma': hp.uniform('gamma', 0.001, 10000), - 'max_iter': 50000, - } - -random_forest_classifier_params = {'criterion': hp.choice('criterion', ['entropy', 'gini']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))} - -lgbm_classifier_params = {'boosting_type': 'gbdt', - 'min_split_gain': 0.1, - 'subsample': 0.8, - 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), - } - -adaboost_classifier_params = {'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), - } +knn_classifier_params = { + 'n_neighbors': hp.choice('n_neighbors', range(2, 20)), + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), + 'leaf_size': hp.choice('leaf_size', range(5, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", "chebyshev", "minkowski"]), + 'p': hp.choice('p', range(1, 4)), +} + +svc_params = { + 'C': hp.lognormal('C', 0, 1), + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 6)), + 'gamma': hp.uniform('gamma', 0.001, 10000), + 'max_iter': 50000, +} + +random_forest_classifier_params = { + 'criterion': hp.choice('criterion', ['entropy', 'gini']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) +} + +lgbm_classifier_params = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 
80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), +} + +adaboost_classifier_params = { + 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), +} extra_trees_regressor_params = { 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 200)), @@ -68,17 +72,20 @@ 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), 'min_impurity_decrease': 0.0 } + ridge_params = { 'fit_intercept': True, 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) } + random_forest_regressor_params = { 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), 'max_features': hp.uniform('max_features', 0, 1.0), 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) } + lgbm_regressor_params = { 'boosting_type': 'gbdt', 'min_split_gain': 0.1, @@ -90,6 +97,7 @@ 'n_estimators': hp.choice('n_estimators', range(50, 200)), 'learning_rate': hp.lognormal('learning_rate', 0, 1), } + adaboost_regressor_params = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), @@ -98,7 +106,8 @@ class TabularEstimator(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None): + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=25, + ensemble_strategy='ranked_ensembling', ensemble_method='max_voting'): """ Initialization function for tabular supervised learner. 
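# Standalone sketch (random objective, stand-in space): how hyperopt consumes a search
# space built from hp.choice / hp.uniform like the parameter dictionaries above.
import hyperopt
import numpy as np
from hyperopt import fmin, hp, space_eval, Trials, STATUS_OK

space = hp.choice('classifier', [
    {'model': 'knn', 'param': {'n_neighbors': hp.choice('n_neighbors', range(2, 20))}},
    {'model': 'rf', 'param': {'n_estimators': hp.choice('rf_n_estimators', range(50, 200))}},
])

def objective_func(args):
    # A real objective would cross-validate args['model'](**args['param']); the score is
    # random here only so the snippet stays self-contained.
    return {'loss': np.random.rand(), 'status': STATUS_OK, 'space': args}

trials = Trials()
best = fmin(objective_func, space, algo=hyperopt.rand.suggest, trials=trials, max_evals=10)
print(space_eval(space, best))    # the best sampled {'model': ..., 'param': ...}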
""" @@ -108,17 +117,17 @@ def __init__(self, path=None, verbose=True, time_limit=None): self.objective = None abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.best_estimator_ = None - self.use_ensembling = False + self.use_ensembling = use_ensembling self.hparams = None - self.num_estimators_ensemble = 2 - self.ensemble_strategy = 'ranked_ensembling' - self.ensemble_method = 'max_voting' + self.num_estimators_ensemble = num_estimators_ensemble + self.ensemble_strategy = ensemble_strategy + self.ensemble_method = ensemble_method def fit(self, x, y): if self.objective == 'classification': n_classes = len(set(y)) self.objective = 'binary' if n_classes == 2 else 'multiclass' - x, y = self.resample(x, y) + # x, y = self.resample(x, y) self.best_estimator_, _ = self.search(x, y) self.best_estimator_.fit(x, y) self.save_model() @@ -126,6 +135,13 @@ def fit(self, x, y): def predict(self, x, y=None): y_pred = self.best_estimator_.predict(x, ) return y_pred + + def predict_proba(self, x, y=None): + try: + y_pred = self.best_estimator_.predict_proba(x, ) + except: + y_pred = self.best_estimator_.predict(x, ) + return y_pred @staticmethod def resample(X, y): @@ -144,13 +160,16 @@ def subsample(x, y, sample_percent): grid_train_x, grid_train_y = x[idx, :], y[idx] return grid_train_x, grid_train_y - def search(self, x, y, search_iter=4, folds=3, sample_percent=0.1): + def search(self, x, y, search_iter=100, folds=3, sample_percent=0.1): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=sample_percent) score_metric, skf = self.get_skf(folds) def objective_func(args): clf = args['model'](**args['param']) - eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + try: + eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + except ValueError: + eval_score = 0 if self.verbose: print("CV Score:", eval_score) print("\n=================") @@ -206,9 +225,9 @@ def __init__(self, path=None, verbose=True, time_limit=None): {'model': LGBMClassifier, 'param': lgbm_classifier_params }, - {'model': AdaBoostClassifier, - 'param': adaboost_classifier_params - } + # {'model': AdaBoostClassifier, + # 'param': adaboost_classifier_params + # } ]) def get_skf(self, folds): diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index d2ac6d8..c42f7f6 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -22,14 +22,9 @@ def __init__(self, verbose=True): """ self.verbose = verbose - self.total_samples = 0 - self.n_first_batch_keys = {} - self.high_level_cat_keys = [] - self.feature_add_high_cat = 0 self.feature_add_cat_num = 10 self.feature_add_cat_cat = 10 - self.order_num_cat_pair = {} self.budget = None self.data_info = None @@ -56,21 +51,21 @@ def fit(self, raw_x, y, time_limit, data_info): self.pipeline = Pipeline([ ('imputer', Imputation(selected_type='ALL', operation='upd')), - ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), - ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), - ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), - ('count_encoder', CatCount(selected_type='CAT', operation='add')), - ('label_encoder', CatEncoder(selected_type='CAT', operation='add')), - ('boxcox', BoxCox(selected_type='NUM', operation='upd')), - ('log_square', LogTransform(selected_type='NUM', operation='upd')), + # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), + # 
('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), + # ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), + ('count_encoder', CatCount(selected_type='CAT', operation='upd')), + # ('label_encoder', CatEncoder(selected_type='CAT', operation='add')), + # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), + # ('log_square', LogTransform(selected_type='NUM', operation='upd')), ('scaler', TabScaler(selected_type='NUM', operation='upd')), - ('binning', Binning(selected_type='NUM', operation='add')), - ('pca', TabPCA(selected_type='NUM', operation='add')), - ('time_diff', TimeDiff(selected_type='TIME', operation='upd')), - ('time_offset', TimeOffset(selected_type='TIME', operation='upd')), + # ('binning', Binning(selected_type='NUM', operation='upd')), + # ('pca', TabPCA(selected_type='NUM', operation='add')), + # ('time_diff', TimeDiff(selected_type='TIME', operation='add')), + # ('time_offset', TimeOffset(selected_type='TIME', operation='upd')), ('filter', FilterConstant(selected_type='ALL', operation='del')), - ('pearson_corr', FeatureFilter(selected_type='ALL', operation='del')), - ('lgbm_feat_selection', FeatureImportance(selected_type='ALL', operation='del')), + # ('pearson_corr', FeatureFilter(selected_type='ALL', operation='del')), + # ('lgbm_feat_selection', FeatureImportance(selected_type='ALL', operation='del')), ]) self.pipeline.fit(data, y) @@ -97,7 +92,9 @@ def transform(self, raw_x, time_limit=None): self.budget = time_limit data = TabularData(raw_x, self.data_info, self.verbose) - return self.pipeline.transform(data).X.values + a = self.pipeline.transform(data).X + print(a.head()) + return a.values @staticmethod def extract_data_info(raw_x): @@ -142,7 +139,6 @@ def __init__(self, raw_x, data_info, verbose=True): raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) self.X = raw_x - self.cat_cardinality = {} self.update_cat_cardinality() if self.verbose: @@ -152,8 +148,6 @@ def __init__(self, raw_x, data_info, verbose=True): print('#CAT features: {}'.format(self.n_cat)) def update_type(self, columns, new_type): - if not new_type: - return for c in columns: self.data_info[c] = new_type @@ -161,18 +155,28 @@ def delete_type(self, columns): for c in columns: _ = self.data_info.pop(c, 0) - def update(self, operation, columns, x_tr, new_type=None): + def rename_cols(self, key): + def rename_fn(col_name): + col_name += '_' + key + while col_name in self.X.columns: + col_name += '_' + key + return col_name + return rename_fn + + def update(self, operation, columns, x_tr, new_type=None, key=''): if operation == 'upd': if x_tr is not None: self.X[columns] = x_tr - self.update_type(columns, new_type) + if new_type is not None: + self.update_type(columns, new_type) elif operation == 'add': if x_tr is not None: + x_tr = x_tr.rename(columns=self.rename_cols(key)) self.X = pd.concat([self.X, x_tr], axis=1) self.update_type(x_tr.columns, new_type) elif operation == 'del': if len(columns) != 0: - self.X.drop(columns, inplace=True) + self.X.drop(columns=columns, inplace=True) self.delete_type(columns) else: print("invalid operation") @@ -187,6 +191,8 @@ def refresh_col_types(self): self.n_cat = len(self.cat_col) def update_cat_cardinality(self): + if not self.cat_cardinality: + self.cat_cardinality = {} for c in self.cat_col: self.cat_cardinality[c] = len(set(self.X[c])) @@ -208,13 +214,22 @@ def select_columns(self, data_type): class Primitive(BaseEstimator, TransformerMixin): selected = None drop_columns = None + 
options = None + supported_ops = ('add', 'upd', 'del') def __init__(self, selected_type=None, operation='upd', **kwargs): self.selected_type = selected_type self.operation = operation + self.init_vars(**kwargs) + + def init_vars(self, **kwargs): + self.options = kwargs def fit(self, data, y=None): self.selected = data.select_columns(self.selected_type) + if self.operation not in self.supported_ops: + print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) + self.selected = None if not self.selected: return self return self._fit(data, y) @@ -234,15 +249,14 @@ def _transform(self, data, y=None): class PrimitiveHigherOrder(Primitive): - def __init__(self, operation='upd', selected_type1=None, selected_type2=None, **kwargs): - self.selected_type1 = selected_type1 + def __init__(self, operation='upd', selected_type=None, selected_type2=None, **kwargs): + super().__init__(selected_type, operation, **kwargs) self.selected_type2 = selected_type2 - self.operation = operation - self.options = kwargs class TabScaler(Primitive): scaler = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.scaler = StandardScaler() @@ -257,6 +271,7 @@ def _transform(self, data, y=None): class BoxCox(Primitive): transformer = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.transformer = PowerTransformer() @@ -271,9 +286,11 @@ def _transform(self, data, y=None): class Binning(Primitive): binner = None + strategy = None + encoding = None + supported_ops = ('add', 'upd') - def __init__(self, selected_type=None, operation='upd', strategy='quantile', encoding='ordinal'): - super().__init__(selected_type, operation) + def init_vars(self, strategy='quantile', encoding='ordinal'): self.strategy = strategy self.encoding = encoding @@ -291,6 +308,7 @@ def _transform(self, data, y=None): class CatEncoder(Primitive): cat_to_int_label = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): X = data.X @@ -314,6 +332,7 @@ def _transform(self, data, y=None): class TargetEncoder(Primitive): target_encoding_map = None + supported_ops = ('add', 'upd') @staticmethod def calc_smooth_mean(df, by, on, alpha=5): @@ -327,7 +346,7 @@ def calc_smooth_mean(df, by, on, alpha=5): # Compute the "smoothed" means smooth = (counts * means + alpha * mean) / (counts + alpha) - return smooth + return smooth, mean def _fit(self, data, y=None): X = data.X @@ -341,12 +360,13 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].map(self.target_encoding_map[col]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], self.target_encoding_map[col][1]) + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.__class__.__name__) return data class CatCatEncoder(PrimitiveHigherOrder): + supported_ops = ('add', 'upd') @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): if strategy == 'count': @@ -375,6 +395,8 @@ def _transform(self, data, y=None): class CatNumEncoder(PrimitiveHigherOrder): + supported_ops = ('add', 'upd') + def __init__(self, selected_type=None, selected_num=[], operation='add', strategy='mean'): super().__init__(selected_type, operation) self.selected_num = selected_num @@ -415,6 +437,8 @@ def _transform(self, data, y=None): class CatBinEncoder(PrimitiveHigherOrder): + supported_ops = ('add', 'upd') + @staticmethod def cat_bin_interaction(df, col1, col2, 
strategy='percent_true'): if strategy == 'percent_true': @@ -445,9 +469,10 @@ def _transform(self, data, y=None): class FilterConstant(Primitive): drop_columns = None + supported_ops = ('del',) def _fit(self, data, y=None): - X = data.X + X = data.X[self.selected] self.drop_columns = X.columns[(X.max(axis=0) - X.min(axis=0) == 0)].tolist() return self @@ -457,6 +482,7 @@ def _transform(self, data, y=None): class TimeDiff(Primitive): + supported_ops = ('add', ) def _fit(self, data, y=None): return self @@ -471,6 +497,7 @@ def _transform(self, data, y=None): class TimeOffset(Primitive): start_time = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.start_time = data.X[self.selected].min(axis=0) @@ -485,6 +512,7 @@ def _transform(self, data, y=None): class TabPCA(Primitive): pca = None + supported_ops = ('add', ) def _fit(self, data, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') @@ -500,6 +528,7 @@ def _transform(self, data, y=None): class CatCount(Primitive): count_dict = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.count_dict = {} @@ -517,6 +546,7 @@ def _transform(self, data, y=None): class LogTransform(Primitive): name_key = 'log_' + supported_ops = ('add', 'upd') def _fit(self, data, y=None): return self @@ -531,6 +561,7 @@ def _transform(self, data, y=None): class Imputation(Primitive): impute_dict = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.impute_dict = {} @@ -540,15 +571,18 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col in self.selected: - data.X[col].fillna(self.impute_dict[col]) - data.update(self.operation, self.selected, None, new_type='NUM') + x_tr[col] = data.X[col].fillna(self.impute_dict[col]) + data.update(self.operation, self.selected, x_tr, new_type=None) return data class FeatureFilter(Primitive): - def __init__(self, selected_type=None, operation='del', threshold=0.001): - super().__init__(selected_type, operation) + threshold = None + supported_ops = ('del',) + + def init_vars(self, threshold=0.001): self.threshold = threshold self.drop_columns = [] @@ -567,8 +601,11 @@ def _transform(self, data, y=None): class FeatureImportance(Primitive): - def __init__(self, selected_type=None, operation='del', threshold=0.001, task_type='classification'): - super().__init__(selected_type, operation) + threshold = None + task_type = 'classification' + supported_ops = ('del',) + + def init_vars(self, threshold=0.001, task_type='classification'): self.threshold = threshold self.drop_columns = [] self.task_type = task_type diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 50bcfbb..b057257 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -285,6 +285,6 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) import time t1 = time.time() - ak.evaluate(3021) + ak.evaluate(2071) t2 = time.time() print(t2-t1) From 08f4a4ded758d337eca202ec1175413ab5c08072 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Sun, 15 Sep 2019 21:26:16 -0500 Subject: [PATCH 16/31] Added global config/fixed label encoder --- autokaggle/auto_ml.py | 32 +++++----- autokaggle/ensemblers.py | 1 - autokaggle/estimators.py | 110 +++++++++++++++++++++------------ autokaggle/preprocessor.py | 60 +++++++----------- examples/benchmarking.py | 2 +- examples/tabular_regression.py | 2 +- 6 files changed, 106 insertions(+), 101 deletions(-) diff --git a/autokaggle/auto_ml.py 
b/autokaggle/auto_ml.py index 97e0ea3..b726f5c 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -13,20 +13,16 @@ class AutoKaggle(BaseEstimator): - def __init__(self, estimator=Classifier(), preprocessor=TabularPreprocessor(), path=None, verbose=True): + def __init__(self, objective='classification', config=Config()): """ Initialization function for tabular supervised learner. """ - self.verbose = verbose self.is_trained = False - self.objective = None - self.preprocessor = preprocessor - self.model = estimator - self.path = path if path is not None else rand_temp_folder_generator() - ensure_dir(self.path) - if self.verbose: - print('Path:', path) - self.time_limit = None + self.config = config + self.config.objective = objective + self.preprocessor = TabularPreprocessor(config) + # TODO find elegant way of specifying classification or regression + self.model = Classifier(config) if objective == 'classification' else Regressor(config) def fit(self, x, y, time_limit=None, data_info=None): """ @@ -44,7 +40,7 @@ def fit(self, x, y, time_limit=None, data_info=None): you should warm-start your training from the pre-trained model. Past data will NOT be available for re-training. """ - self.time_limit = time_limit if time_limit else 24 * 60 * 60 + self.config.time_limit = time_limit if time_limit else 24 * 60 * 60 if x.shape[1] == 0: raise ValueError("No feature exist!") @@ -58,12 +54,12 @@ def fit(self, x, y, time_limit=None, data_info=None): # self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor - self.preprocessor.fit(x, y, self.time_limit, data_info) + self.preprocessor.fit(x, y, data_info) x = self.preprocessor.transform(x) self.model.fit(x, y) self.is_trained = True - if self.verbose: + if self.config.verbose: print("The whole available data is: ") print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) @@ -82,15 +78,15 @@ def predict(self, x_test, predict_proba=False): return y def evaluate(self, x_test, y_test): - if self.verbose: - print('objective:', self.model.objective) + if self.config.verbose: + print('objective:', self.config.objective) y_pred = self.predict(x_test) results = None - if self.model.objective == 'binary': + if self.config.objective == 'binary': results = roc_auc_score(y_test, y_pred) - elif self.model.objective == 'multiclass': + elif self.config.objective == 'multiclass': results = f1_score(y_test, y_pred, average='weighted') - elif self.model.objective == 'regression': + elif self.config.objective == 'regression': results = mean_squared_error(y_test, y_pred) return results diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 1e5fe88..6f6fd5d 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -1,5 +1,4 @@ from sklearn.base import BaseEstimator -from autokaggle.preprocessor import TabularPreprocessor from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from abc import abstractmethod import numpy as np diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 6d34553..45b90b7 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -105,28 +105,54 @@ } -class TabularEstimator(BaseEstimator): +class Config(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=25, - ensemble_strategy='ranked_ensembling', ensemble_method='max_voting'): - """ - Initialization function for tabular supervised learner. 
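# Standalone sketch: wiring the shared Config object introduced in this patch into
# AutoKaggle; the option values below are arbitrary, and x_train/y_train/x_test/datainfo
# are assumed to exist as in the examples/ scripts.
from autokaggle import AutoKaggle
from autokaggle.estimators import Config

cfg = Config(verbose=True, use_ensembling=True, num_estimators_ensemble=10,
             search_iter=200, cv_folds=3, subsample_ratio=0.1)
clf = AutoKaggle(objective='classification', config=cfg)
# clf.fit(x_train, y_train, time_limit=60 * 60, data_info=datainfo)
# y_prob = clf.predict(x_test, predict_proba=True)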
- """ + ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, + subsample_ratio=0.1): self.verbose = verbose - self.path = path + self.path = path if path is not None else rand_temp_folder_generator() + ensure_dir(self.path) + if self.verbose: + print('Path:', path) self.time_limit = time_limit self.objective = None - abs_cwd = os.path.split(os.path.abspath(__file__))[0] - self.best_estimator_ = None self.use_ensembling = use_ensembling self.hparams = None self.num_estimators_ensemble = num_estimators_ensemble self.ensemble_strategy = ensemble_strategy self.ensemble_method = ensemble_method - + self.search_iter = search_iter + self.cv_folds = cv_folds + self.subsample_ratio = subsample_ratio + self.resampling_strategy = 'auto' + self.random_state = 1001 + self.feature_add_high_cat = 0 + self.feature_add_cat_num = 10 + self.feature_add_cat_cat = 10 + + +class TabularEstimator(BaseEstimator): + def __init__(self, config=Config(), **kwargs): + """ + Initialization function for tabular supervised learner. + """ + self.config = config + # self.verbose = config.verbose + # self.path = config.path + # self.time_limit = config.time_limit + # self.objective = None + # abs_cwd = os.path.split(os.path.abspath(__file__))[0] + self.best_estimator_ = None + # self.use_ensembling = config.use_ensembling + self.hparams = None + # self.num_estimators_ensemble = config.num_estimators_ensemble + # self.ensemble_strategy = config.ensemble_strategy + # self.ensemble_method = config.ensemble_method + def fit(self, x, y): - if self.objective == 'classification': + if self.config.objective == 'classification': n_classes = len(set(y)) - self.objective = 'binary' if n_classes == 2 else 'multiclass' + self.config.objective = 'binary' if n_classes == 2 else 'multiclass' # x, y = self.resample(x, y) self.best_estimator_, _ = self.search(x, y) self.best_estimator_.fit(x, y) @@ -142,10 +168,9 @@ def predict_proba(self, x, y=None): except: y_pred = self.best_estimator_.predict(x, ) return y_pred - - @staticmethod - def resample(X, y): - return SMOTE(sampling_strategy='auto').fit_resample(X, y) + + def resample(self, X, y): + return SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(X, y) @staticmethod def subsample(x, y, sample_percent): @@ -160,9 +185,9 @@ def subsample(x, y, sample_percent): grid_train_x, grid_train_y = x[idx, :], y[idx] return grid_train_x, grid_train_y - def search(self, x, y, search_iter=100, folds=3, sample_percent=0.1): - grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=sample_percent) - score_metric, skf = self.get_skf(folds) + def search(self, x, y): + grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) + score_metric, skf = self.get_skf(self.config.cv_folds) def objective_func(args): clf = args['model'](**args['param']) @@ -170,29 +195,33 @@ def objective_func(args): eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() except ValueError: eval_score = 0 - if self.verbose: + if self.config.verbose: print("CV Score:", eval_score) print("\n=================") return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=search_iter) - if self.use_ensembling: + best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, + max_evals=self.config.search_iter) + if self.config.use_ensembling: 
best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) estimator_list = [] - for i in range(self.num_estimators_ensemble): + for i in range(self.config.num_estimators_ensemble): model_params = best_trials[i]['space'] est = model_params['model'](**model_params['param']) estimator_list.append(est) - if self.ensemble_strategy == 'ranked_ensembling': - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.ensemble_method) - elif self.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, objective=self.objective) + if self.config.ensemble_strategy == 'ranked_ensembling': + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + elif self.config.ensemble_strategy == 'stacking': + best_estimator_ = StackingEnsembler(estimator_list, objective=self.config.objective) else: - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.ensemble_method) + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) else: opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) + if self.config.verbose: + print("The best hyperparameter setting is:") + print(opt) return best_estimator_, trials @abstractmethod @@ -208,9 +237,9 @@ class Classifier(TabularEstimator): """Classifier class. It is used for tabular data classification. """ - def __init__(self, path=None, verbose=True, time_limit=None): - super().__init__(path, verbose, time_limit) - self.objective = 'classification' + def __init__(self, config=Config(), **kwargs): + super().__init__(config, **kwargs) + self.config.objective = 'classification' # TODO: add choice to the set of estimators self.hparams = hp.choice('classifier', [ {'model': KNeighborsClassifier, @@ -225,18 +254,18 @@ def __init__(self, path=None, verbose=True, time_limit=None): {'model': LGBMClassifier, 'param': lgbm_classifier_params }, - # {'model': AdaBoostClassifier, - # 'param': adaboost_classifier_params - # } + {'model': AdaBoostClassifier, + 'param': adaboost_classifier_params + } ]) def get_skf(self, folds): - if self.objective == 'binary': + if self.config.objective == 'binary': score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) else: score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) return score_metric, skf @@ -244,9 +273,9 @@ class Regressor(TabularEstimator): """Regressor class. It is used for tabular data regression. 
""" - def __init__(self, path=None, verbose=True, time_limit=None): - super().__init__(path, verbose, time_limit) - self.objective = 'regression' + def __init__(self, config=Config(), **kwargs): + super().__init__(config, **kwargs) + self.config.objective = 'regression' # TODO: add choice to the set of estimators self.hparams = hp.choice('regressor', [ {'model': ExtraTreesRegressor, @@ -267,5 +296,4 @@ def __init__(self, path=None, verbose=True, time_limit=None): ]) def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index c42f7f6..baee70e 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -12,25 +12,20 @@ from abc import abstractmethod import collections from lightgbm import LGBMClassifier, LGBMRegressor +from autokaggle.estimators import Config LEVEL_HIGH = 32 class TabularPreprocessor: - def __init__(self, verbose=True): + def __init__(self, config): """ Initialization function for tabular preprocessor. """ - self.verbose = verbose - - self.feature_add_high_cat = 0 - self.feature_add_cat_num = 10 - self.feature_add_cat_cat = 10 - - self.budget = None self.data_info = None self.pipeline = None + self.config = config - def fit(self, raw_x, y, time_limit, data_info): + def fit(self, raw_x, y, data_info): """ This function should train the model parameters. @@ -43,19 +38,18 @@ def fit(self, raw_x, y, time_limit, data_info): 'TIME' for temporal feature, 'NUM' for other numerical feature, and 'CAT' for categorical feature. """ - self.budget = time_limit # Extract or read data info self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - data = TabularData(raw_x, self.data_info, self.verbose) + data = TabularData(raw_x, self.data_info, self.config.verbose) self.pipeline = Pipeline([ ('imputer', Imputation(selected_type='ALL', operation='upd')), # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), # ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), # ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), - ('count_encoder', CatCount(selected_type='CAT', operation='upd')), - # ('label_encoder', CatEncoder(selected_type='CAT', operation='add')), + # ('count_encoder', CatCount(selected_type='CAT', operation='upd')), + ('label_encoder', CatEncoder(selected_type='CAT', operation='upd')), # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), # ('log_square', LogTransform(selected_type='NUM', operation='upd')), ('scaler', TabScaler(selected_type='NUM', operation='upd')), @@ -71,29 +65,20 @@ def fit(self, raw_x, y, time_limit, data_info): return self - def transform(self, raw_x, time_limit=None): + def transform(self, raw_x): """ This function should train the model parameters. Args: raw_x: a numpy.ndarray instance containing the training/testing data. - time_limit: remaining time budget. Both inputs X and y are numpy arrays. If fit is called multiple times on incremental data (train, test1, test2, etc.) you should warm-start your training from the pre-trained model. Past data will NOT be available for re-training. 
""" # Get Meta-Feature - if time_limit is None: - if self.budget is None: - time_limit = 24 * 60 * 60 - self.budget = time_limit - else: - self.budget = time_limit - - data = TabularData(raw_x, self.data_info, self.verbose) + data = TabularData(raw_x, self.data_info, self.config.verbose) a = self.pipeline.transform(data).X - print(a.head()) return a.values @staticmethod @@ -308,25 +293,22 @@ def _transform(self, data, y=None): class CatEncoder(Primitive): cat_to_int_label = None + unknown_key_dict = None supported_ops = ('add', 'upd') def _fit(self, data, y=None): - X = data.X self.cat_to_int_label = {} - for col_index in self.selected: - self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) - for row_index in range(len(X)): - key = str(X[row_index, col_index]) - if key not in self.cat_to_int_label[col_index]: - self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) + self.unknown_key_dict = {} + for col in self.selected: + self.cat_to_int_label[col] = {key: idx for idx, key in enumerate(set(data.X[col]))} + self.unknown_key_dict[col] = len(self.cat_to_int_label[col]) return self def _transform(self, data, y=None): - X = data.X - for col_index in self.selected: - for row_index in range(len(X)): - key = str(X[row_index, col_index]) - X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) + x_tr = pd.DataFrame() + for col in self.selected: + x_tr[col] = data.X[col].apply(lambda key: self.cat_to_int_label[col].get(key, self.unknown_key_dict[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM') return data @@ -528,6 +510,7 @@ def _transform(self, data, y=None): class CatCount(Primitive): count_dict = None + unknown_key = 0 supported_ops = ('add', 'upd') def _fit(self, data, y=None): @@ -539,7 +522,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col][key]) + x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col].get(key, self.unknown_key)) data.update(self.operation, self.selected, x_tr, new_type='NUM') return data @@ -663,5 +646,4 @@ def _transform(self, data, y=None): x_new = prep.transform(x_train) print("-----") - print(x_new[:4, 2]) - + print(x_new[:4, 2]) \ No newline at end of file diff --git a/examples/benchmarking.py b/examples/benchmarking.py index b057257..60ceffa 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -285,6 +285,6 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) import time t1 = time.time() - ak.evaluate(2071) + ak.evaluate(31) t2 = time.time() print(t2-t1) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index c338dd2..3b49846 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle(Regressor()) + clf = AutoKaggle('regression') datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From cdd2ba7e65b301f649b20ff35046e56f9f84bb65 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 17 Sep 2019 11:01:03 -0500 Subject: [PATCH 17/31] Split to classifier and regressor --- autokaggle/__init__.py | 2 +- autokaggle/auto_ml.py | 26 +++++-- autokaggle/ensemblers.py | 12 ++-- autokaggle/estimators.py | 51 ++++++++----- autokaggle/preprocessor.py | 71 
++++++++++++------- examples/benchmarking.py | 18 +++-- examples/tabular_classification_binary.py | 2 +- examples/tabular_classification_multiclass.py | 2 +- examples/tabular_regression.py | 2 +- 9 files changed, 125 insertions(+), 61 deletions(-) diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index 0245d5b..1a975c9 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,3 +1,3 @@ -from autokaggle.auto_ml import AutoKaggle +from autokaggle.auto_ml import AutoKaggleClassifier, AutoKaggleRegressor from autokaggle.estimators import * from autokaggle.ensemblers import * diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index b726f5c..c2fc7c5 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -13,16 +13,18 @@ class AutoKaggle(BaseEstimator): - def __init__(self, objective='classification', config=Config()): + objective = None + model = None + + def __init__(self, config=Config(), **kwargs): """ Initialization function for tabular supervised learner. """ self.is_trained = False self.config = config - self.config.objective = objective + self.config.update(kwargs) + self.config.objective = self.objective self.preprocessor = TabularPreprocessor(config) - # TODO find elegant way of specifying classification or regression - self.model = Classifier(config) if objective == 'classification' else Regressor(config) def fit(self, x, y, time_limit=None, data_info=None): """ @@ -93,3 +95,19 @@ def evaluate(self, x_test, y_test): def final_fit(self, x_train, y_train): x_train = self.preprocessor.transform(x_train) self.model.fit(x_train, y_train) + + +class AutoKaggleClassifier(AutoKaggle): + objective = 'classification' + + def __init__(self, config=Config(), **kwargs): + super().__init__(config, **kwargs) + self.model = Classifier(config) + + +class AutoKaggleRegressor(AutoKaggle): + objective = 'regression' + + def __init__(self, config=Config(), **kwargs): + super().__init__(config, **kwargs) + self.model = Regressor(config) diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 6f6fd5d..507fa6e 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -49,14 +49,14 @@ def __init__(self, estimator_list, objective): self.objective = objective if self.objective == 'regression': self.stacking_estimator = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) + verbose=-1, + n_jobs=1, + objective=self.objective) elif self.objective == 'multiclass' or self.objective == 'binary': self.stacking_estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) + verbose=-1, + n_jobs=1, + objective=self.objective) def fit(self, X, y): for est in self.estimator_list: diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 45b90b7..7a1d4a5 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -106,9 +106,9 @@ class Config(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=25, + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, - subsample_ratio=0.1): + subsample_ratio=0.1, random_ensemble=False): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -119,16 +119,20 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=Fals 
self.use_ensembling = use_ensembling self.hparams = None self.num_estimators_ensemble = num_estimators_ensemble + # self.ensemble_strategy = ensemble_strategy self.ensemble_strategy = ensemble_strategy self.ensemble_method = ensemble_method + self.random_ensemble = random_ensemble self.search_iter = search_iter self.cv_folds = cv_folds self.subsample_ratio = subsample_ratio self.resampling_strategy = 'auto' self.random_state = 1001 - self.feature_add_high_cat = 0 - self.feature_add_cat_num = 10 - self.feature_add_cat_cat = 10 + + def update(self, options): + for k, v in options.items(): + if hasattr(self, k): + setattr(self, k, v) class TabularEstimator(BaseEstimator): @@ -203,24 +207,14 @@ def objective_func(args): trials = Trials() best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=self.config.search_iter) + if self.config.use_ensembling: - best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) - estimator_list = [] - for i in range(self.config.num_estimators_ensemble): - model_params = best_trials[i]['space'] - est = model_params['model'](**model_params['param']) - estimator_list.append(est) - if self.config.ensemble_strategy == 'ranked_ensembling': - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) - elif self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, objective=self.config.objective) - else: - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + best_estimator_ = self.setup_ensemble(trials) else: opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) if self.config.verbose: - print("The best hyperparameter setting is:") + print("The best hyperparameter setting found:") print(opt) return best_estimator_, trials @@ -231,6 +225,27 @@ def save_model(self): @abstractmethod def get_skf(self, folds): pass + + def setup_ensemble(self, trials): + best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) + # Filter the unsuccessful hparam spaces i.e. 
'loss' == 1 + best_trials = [t for t in best_trials if t['loss'] < 1] + self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) + if self.config.random_ensemble: + np.random.shuffle(best_trials) + estimator_list = [] + for i in range(self.config.num_estimators_ensemble): + model_params = best_trials[i]['space'] + est = model_params['model'](**model_params['param']) + estimator_list.append(est) + + if self.config.ensemble_strategy == 'ranked_ensembling': + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + elif self.config.ensemble_strategy == 'stacking': + best_estimator_ = StackingEnsembler(estimator_list, objective=self.config.objective) + else: + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + return best_estimator_ class Classifier(TabularEstimator): diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index baee70e..6bc0013 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -4,7 +4,7 @@ import itertools from scipy.stats import pearsonr from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer +from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer, OneHotEncoder from sklearn.base import TransformerMixin from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer @@ -47,12 +47,13 @@ def fit(self, raw_x, y, data_info): ('imputer', Imputation(selected_type='ALL', operation='upd')), # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), # ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), - # ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), + ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), # ('count_encoder', CatCount(selected_type='CAT', operation='upd')), - ('label_encoder', CatEncoder(selected_type='CAT', operation='upd')), - # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), - # ('log_square', LogTransform(selected_type='NUM', operation='upd')), + # ('one_hot_encoder', OneHot(selected_type='CAT', operation='upd')), + # ('label_encoder', LabelEncode(selected_type='CAT', operation='upd')), ('scaler', TabScaler(selected_type='NUM', operation='upd')), + # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), + # ('log_transform', LogTransform(selected_type='NUM', operation='upd')), # ('binning', Binning(selected_type='NUM', operation='upd')), # ('pca', TabPCA(selected_type='NUM', operation='add')), # ('time_diff', TimeDiff(selected_type='TIME', operation='add')), @@ -142,6 +143,7 @@ def delete_type(self, columns): def rename_cols(self, key): def rename_fn(col_name): + col_name = str(col_name) col_name += '_' + key while col_name in self.X.columns: col_name += '_' + key @@ -201,11 +203,13 @@ class Primitive(BaseEstimator, TransformerMixin): drop_columns = None options = None supported_ops = ('add', 'upd', 'del') + name_key = '' def __init__(self, selected_type=None, operation='upd', **kwargs): self.selected_type = selected_type self.operation = operation self.init_vars(**kwargs) + self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): self.options = kwargs @@ -250,7 +254,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.scaler.transform(data.X[self.selected]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + 
data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -265,7 +269,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.transformer.transform(data.X[self.selected]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -287,11 +291,30 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.binner.transform(data.X[self.selected]) # TODO: decide if cat or num new type - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + return data + + +class OneHot(Primitive): + ohe = None + supported_ops = ('add', 'upd') + + def _fit(self, data, y=None): + self.ohe = OneHotEncoder(sparse=False, handle_unknown='ignore') + self.ohe.fit(data.X[self.selected], y) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame(self.ohe.transform(data.X[self.selected])) + if self.operation == 'add': + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + elif self.operation == 'upd': + data.update('add', self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update('del', self.selected, None, None, key=self.name_key) return data -class CatEncoder(Primitive): +class LabelEncode(Primitive): cat_to_int_label = None unknown_key_dict = None supported_ops = ('add', 'upd') @@ -308,7 +331,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].apply(lambda key: self.cat_to_int_label[col].get(key, self.unknown_key_dict[col])) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -343,7 +366,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], self.target_encoding_map[col][1]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.__class__.__name__) + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -372,7 +395,7 @@ def _transform(self, data, y=None): if col1 + '_cross_' + col2 in self.cat_cat_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) # TODO: decide new_type - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -414,7 +437,7 @@ def _transform(self, data, y=None): for col2 in self.selected_num: if col1 + '_cross_' + col2 in self.cat_num_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -445,7 +468,7 @@ def _transform(self, data, y=None): for col2 in self.selected_bin: if col1 + '_cross_' + col2 in self.cat_bin_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -459,7 +482,7 @@ def _fit(self, data, y=None): return self def _transform(self, 
data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None) + data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) return data @@ -473,7 +496,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for a, b in itertools.combinations(self.selected, 2): x_tr[a + '-' + b] = data.X[a] - data.X[b] - data.update(self.operation, self.selected, x_tr, new_type='TIME') + data.update(self.operation, self.selected, x_tr, new_type='TIME', key=self.name_key) return data @@ -488,7 +511,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() x_tr[self.selected] = data.X[self.selected] - self.start_time - data.update(self.operation, self.selected, x_tr, new_type='TIME') + data.update(self.operation, self.selected, x_tr, new_type='TIME', key=self.name_key) return data @@ -504,7 +527,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_pca = self.pca.transform(data.X[self.selected]) x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in range(x_pca.shape[1])]) - data.update(self.operation, self.selected, x_pca, new_type='NUM') + data.update(self.operation, self.selected, x_pca, new_type='NUM', key=self.name_key) return data @@ -523,7 +546,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col].get(key, self.unknown_key)) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -537,8 +560,8 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[self.name_key + col] = np.square(np.log(1 + data.X[col])) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + x_tr[self.name_key + col] = np.sign(data.X[col]) * np.log(1 + np.abs(data.X[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -557,7 +580,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].fillna(self.impute_dict[col]) - data.update(self.operation, self.selected, x_tr, new_type=None) + data.update(self.operation, self.selected, x_tr, new_type=None, key=self.name_key) return data @@ -579,7 +602,7 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None) + data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) return data @@ -620,7 +643,7 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None) + data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) return data diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 60ceffa..c1f8a09 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -120,9 +120,9 @@ def evaluate(self, task_id, time_limit=10*60): # Train if task.task_type == 'Supervised Classification': - automl = AutoKaggle() if not self.estimator_type else AutoKaggle(self.estimator_type) + automl = AutoKaggleClassifier() elif task.task_type == 'Supervised Regression': - automl = AutoKaggle(Regressor()) if not self.estimator_type else AutoKaggle(self.estimator_type) + automl = AutoKaggleRegressor() else: print("UNSUPPORTED TASK_TYPE") assert(0) @@ 
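The LogTransform change above replaces np.square(np.log(1 + x)) with a signed log, which stays finite for negative inputs; a small worked example of the new transform:

import numpy as np

def signed_log(col):
    # Keep the sign and log-compress the magnitude, so the transform is
    # defined for negative values as well.
    return np.sign(col) * np.log(1 + np.abs(col))

x = np.array([-100.0, -1.0, 0.0, 1.0, 100.0])
print(np.round(signed_log(x), 3))  # [-4.615 -0.693  0.     0.693  4.615]
# The previous np.square(np.log(1 + x)) is NaN or infinite for x <= -1.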
-279,12 +279,20 @@ def get_box_plot(data, task_id, metric): if __name__ == "__main__": regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] + # classification_task_list = [3021] classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, 9914, 3020, 3524, 3573, 3962] - ak = BenchmarkingAutoKaggle() - # ak.run_automation(classification_task_list) + ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time + t1 = time.time() - ak.evaluate(31) + for _ in range(3): + ak.run_automation(classification_task_list) + ak.export_results() t2 = time.time() print(t2-t1) + # import time + # t1 = time.time() + # ak.evaluate(31) + # t2 = time.time() + # print(t2-t1) diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index 0ebd91f..fa8638d 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -19,7 +19,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle() + clf = AutoKaggleClassifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 0da02fe..109d361 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle() + clf = AutoKaggleClassifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 3b49846..885160c 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle('regression') + clf = AutoKaggleRegressor('regression') datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 516515b85cd23c3f642dad1ff9fa48276ff74e20 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 17 Sep 2019 12:56:32 -0500 Subject: [PATCH 18/31] Refactor Config --- autokaggle/auto_ml.py | 9 ++- autokaggle/config.py | 169 +++++++++++++++++++++++++++++++++++++++ autokaggle/estimators.py | 165 +------------------------------------- examples/benchmarking.py | 6 +- 4 files changed, 181 insertions(+), 168 deletions(-) create mode 100644 autokaggle/config.py diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index c2fc7c5..60bb7a1 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -12,9 +12,11 @@ from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +# TODO: Further clean the design of this file class AutoKaggle(BaseEstimator): objective = None model = None + preprocessor = None def __init__(self, config=Config(), **kwargs): """ @@ -24,6 +26,9 @@ def __init__(self, config=Config(), **kwargs): self.config = config self.config.update(kwargs) self.config.objective = self.objective + if not self.config.path: + self.config.path = rand_temp_folder_generator() + # abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.preprocessor = TabularPreprocessor(config) def fit(self, x, y, 
time_limit=None, data_info=None): @@ -50,10 +55,6 @@ def fit(self, x, y, time_limit=None, data_info=None): while x.shape[0] < 60: x = np.concatenate([x, x], axis=0) y = np.concatenate([y, y], axis=0) - - # # Init model and preprocessor - # self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) - # self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor self.preprocessor.fit(x, y, data_info) diff --git a/autokaggle/config.py b/autokaggle/config.py new file mode 100644 index 0000000..edbc65c --- /dev/null +++ b/autokaggle/config.py @@ -0,0 +1,169 @@ +from sklearn.base import BaseEstimator +from autokaggle.utils import rand_temp_folder_generator, ensure_dir +from hyperopt import hp +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor,\ + ExtraTreesRegressor +from sklearn.linear_model import Ridge +from lightgbm import LGBMClassifier, LGBMRegressor + + +class Config(BaseEstimator): + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, + ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False): + self.verbose = verbose + self.path = path if path is not None else rand_temp_folder_generator() + ensure_dir(self.path) + if self.verbose: + print('Path:', self.path) + self.time_limit = time_limit + self.objective = None + self.use_ensembling = use_ensembling + self.hparams = None + self.num_estimators_ensemble = num_estimators_ensemble + self.ensemble_strategy = ensemble_strategy + self.ensemble_method = ensemble_method + self.random_ensemble = random_ensemble + self.search_iter = search_iter + self.cv_folds = cv_folds + self.subsample_ratio = subsample_ratio + self.resampling_strategy = 'auto' + self.random_state = 1001 + self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest', 'adaboost'] + self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost'] + + def update(self, options): + for k, v in options.items(): + if hasattr(self, k): + setattr(self, k, v) + + +knn_classifier_params = { + 'n_neighbors': hp.choice('n_neighbors', range(2, 20)), + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), + 'leaf_size': hp.choice('leaf_size', range(5, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", "chebyshev", "minkowski"]), + 'p': hp.choice('p', range(1, 4)), +} + +svc_params = { + 'C': hp.lognormal('C', 0, 1), + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 6)), + 'gamma': hp.uniform('gamma', 0.001, 10000), + 'max_iter': 50000, +} + +random_forest_classifier_params = { + 'criterion': hp.choice('criterion', ['entropy', 'gini']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) +} + +lgbm_classifier_params = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), +} 
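These dictionaries are ordinary hyperopt search spaces; estimators.py wraps them in hp.choice and hands them to fmin. A minimal, self-contained sketch of how such a space is searched and decoded (the toy objective below only stands in for the real cross-validation loss):

import hyperopt
from hyperopt import fmin, hp, space_eval, Trials, STATUS_OK

space = {
    'n_estimators': hp.choice('n_estimators', range(50, 200)),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
}

def objective(params):
    # Stand-in for the cross_val_score call in TabularEstimator.search.
    loss = abs(params['learning_rate'] - 0.1) + params['n_estimators'] / 1e4
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()
best = fmin(objective, space, algo=hyperopt.rand.suggest, trials=trials, max_evals=20)
print(space_eval(space, best))  # e.g. {'learning_rate': 0.08..., 'n_estimators': 72}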
+ +adaboost_classifier_params = { + 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), +} + +extra_trees_regressor_params = { + 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 200)), + 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), + 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), + 'min_impurity_decrease': 0.0 +} + +ridge_params = { + 'fit_intercept': True, + 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), + 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) +} + +random_forest_regressor_params = { + 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) +} + +lgbm_regressor_params = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), +} + +adaboost_regressor_params = { + 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), +} + + +regression_hspace = { + 'extratree': { + 'model': ExtraTreesRegressor, + 'param': extra_trees_regressor_params + }, + 'ridge': { + 'model': Ridge, + 'param': ridge_params + }, + 'random_forest': { + 'model': RandomForestRegressor, + 'param': random_forest_regressor_params + }, + 'lgbm': { + 'model': LGBMRegressor, + 'param': lgbm_regressor_params + }, + 'adaboost': { + 'model': AdaBoostRegressor, + 'param': adaboost_regressor_params + } +} + + +classification_hspace = { + 'knn': { + 'model': KNeighborsClassifier, + 'param': knn_classifier_params + }, + 'svm': { + 'model': SVC, + 'param': svc_params + }, + 'random_forest': { + 'model': RandomForestClassifier, + 'param': random_forest_classifier_params + }, + 'lgbm': { + 'model': LGBMClassifier, + 'param': lgbm_classifier_params + }, + 'adaboost': { + 'model': AdaBoostClassifier, + 'param': adaboost_classifier_params + } +} \ No newline at end of file diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 7a1d4a5..8cc3fef 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -5,153 +5,28 @@ import random import json -from lightgbm import LGBMClassifier, LGBMRegressor from sklearn.model_selection import RandomizedSearchCV, cross_val_score from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.svm import SVC -from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor,\ - ExtraTreesRegressor -from sklearn.linear_model import Ridge + from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer from joblib import dump, load from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from autokaggle.ensemblers import RankedEnsembler, 
StackingEnsembler +from autokaggle.config import Config, classification_hspace, regression_hspace import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK from imblearn.over_sampling import SMOTE, SMOTENC -# TODO: Way to change the default hparams -knn_classifier_params = { - 'n_neighbors': hp.choice('n_neighbors', range(2, 20)), - 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), - 'leaf_size': hp.choice('leaf_size', range(5, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", "chebyshev", "minkowski"]), - 'p': hp.choice('p', range(1, 4)), -} - -svc_params = { - 'C': hp.lognormal('C', 0, 1), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 6)), - 'gamma': hp.uniform('gamma', 0.001, 10000), - 'max_iter': 50000, -} - -random_forest_classifier_params = { - 'criterion': hp.choice('criterion', ['entropy', 'gini']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) -} - -lgbm_classifier_params = { - 'boosting_type': 'gbdt', - 'min_split_gain': 0.1, - 'subsample': 0.8, - 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), -} - -adaboost_classifier_params = { - 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), -} - -extra_trees_regressor_params = { - 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 200)), - 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), - 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), - 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), - 'min_impurity_decrease': 0.0 -} - -ridge_params = { - 'fit_intercept': True, - 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), - 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) -} - -random_forest_regressor_params = { - 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) -} - -lgbm_regressor_params = { - 'boosting_type': 'gbdt', - 'min_split_gain': 0.1, - 'subsample': 0.8, - 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), -} - -adaboost_regressor_params = { - 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), -} - - -class Config(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, - 
subsample_ratio=0.1, random_ensemble=False): - self.verbose = verbose - self.path = path if path is not None else rand_temp_folder_generator() - ensure_dir(self.path) - if self.verbose: - print('Path:', path) - self.time_limit = time_limit - self.objective = None - self.use_ensembling = use_ensembling - self.hparams = None - self.num_estimators_ensemble = num_estimators_ensemble - # self.ensemble_strategy = ensemble_strategy - self.ensemble_strategy = ensemble_strategy - self.ensemble_method = ensemble_method - self.random_ensemble = random_ensemble - self.search_iter = search_iter - self.cv_folds = cv_folds - self.subsample_ratio = subsample_ratio - self.resampling_strategy = 'auto' - self.random_state = 1001 - - def update(self, options): - for k, v in options.items(): - if hasattr(self, k): - setattr(self, k, v) - - class TabularEstimator(BaseEstimator): def __init__(self, config=Config(), **kwargs): """ Initialization function for tabular supervised learner. """ self.config = config - # self.verbose = config.verbose - # self.path = config.path - # self.time_limit = config.time_limit - # self.objective = None - # abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.best_estimator_ = None - # self.use_ensembling = config.use_ensembling self.hparams = None - # self.num_estimators_ensemble = config.num_estimators_ensemble - # self.ensemble_strategy = config.ensemble_strategy - # self.ensemble_method = config.ensemble_method def fit(self, x, y): if self.config.objective == 'classification': @@ -256,23 +131,7 @@ def __init__(self, config=Config(), **kwargs): super().__init__(config, **kwargs) self.config.objective = 'classification' # TODO: add choice to the set of estimators - self.hparams = hp.choice('classifier', [ - {'model': KNeighborsClassifier, - 'param': knn_classifier_params - }, - {'model': SVC, - 'param': svc_params - }, - {'model': RandomForestClassifier, - 'param': random_forest_classifier_params - }, - {'model': LGBMClassifier, - 'param': lgbm_classifier_params - }, - {'model': AdaBoostClassifier, - 'param': adaboost_classifier_params - } - ]) + self.hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) def get_skf(self, folds): if self.config.objective == 'binary': @@ -292,23 +151,7 @@ def __init__(self, config=Config(), **kwargs): super().__init__(config, **kwargs) self.config.objective = 'regression' # TODO: add choice to the set of estimators - self.hparams = hp.choice('regressor', [ - {'model': ExtraTreesRegressor, - 'param': extra_trees_regressor_params - }, - {'model': Ridge, - 'param': ridge_params - }, - {'model': RandomForestRegressor, - 'param': random_forest_regressor_params - }, - {'model': LGBMRegressor, - 'param': lgbm_regressor_params - }, - {'model': AdaBoostRegressor, - 'param': adaboost_regressor_params - } - ]) + self.hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) def get_skf(self, folds): return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) diff --git a/examples/benchmarking.py b/examples/benchmarking.py index c1f8a09..dec5f93 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -279,9 +279,9 @@ def get_box_plot(data, task_id, metric): if __name__ == "__main__": regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] - # classification_task_list = [3021] - classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 
24, 146230, 31, 10101, - 9914, 3020, 3524, 3573, 3962] + classification_task_list = [3021] + # classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, + # 9914, 3020, 3524, 3573, 3962] ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time From c400a2f58f3e44b1324e51eed504b586d743d5ea Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 17 Sep 2019 15:55:16 -0500 Subject: [PATCH 19/31] Fix config init related bug --- autokaggle/auto_ml.py | 14 +++++++------- autokaggle/config.py | 2 +- autokaggle/estimators.py | 14 ++++++++------ examples/benchmarking.py | 1 + 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 60bb7a1..9fc82c2 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -18,18 +18,18 @@ class AutoKaggle(BaseEstimator): model = None preprocessor = None - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): """ Initialization function for tabular supervised learner. """ self.is_trained = False - self.config = config + self.config = config if config else Config() self.config.update(kwargs) self.config.objective = self.objective if not self.config.path: self.config.path = rand_temp_folder_generator() # abs_cwd = os.path.split(os.path.abspath(__file__))[0] - self.preprocessor = TabularPreprocessor(config) + self.preprocessor = TabularPreprocessor(self.config) def fit(self, x, y, time_limit=None, data_info=None): """ @@ -101,14 +101,14 @@ def final_fit(self, x_train, y_train): class AutoKaggleClassifier(AutoKaggle): objective = 'classification' - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) - self.model = Classifier(config) + self.model = Classifier(self.config) class AutoKaggleRegressor(AutoKaggle): objective = 'regression' - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) - self.model = Regressor(config) + self.model = Regressor(self.config) diff --git a/autokaggle/config.py b/autokaggle/config.py index edbc65c..e4d87aa 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -11,7 +11,7 @@ class Config(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, + ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=5, cv_folds=3, subsample_ratio=0.1, random_ensemble=False): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 8cc3fef..214940b 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -20,11 +20,11 @@ class TabularEstimator(BaseEstimator): - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): """ Initialization function for tabular supervised learner. 
""" - self.config = config + self.config = config if config else Config() self.best_estimator_ = None self.hparams = None @@ -68,7 +68,8 @@ def search(self, x, y): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) score_metric, skf = self.get_skf(self.config.cv_folds) - def objective_func(args): + def objective_func(params): + args = params['estimator'] clf = args['model'](**args['param']) try: eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() @@ -80,7 +81,8 @@ def objective_func(args): return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, + search_space = {'source': hp.choice('data_source', ['a', 'b']), 'estimator': self.hparams} + best = fmin(objective_func, search_space, algo=hyperopt.rand.suggest, trials=trials, max_evals=self.config.search_iter) if self.config.use_ensembling: @@ -127,7 +129,7 @@ class Classifier(TabularEstimator): """Classifier class. It is used for tabular data classification. """ - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) self.config.objective = 'classification' # TODO: add choice to the set of estimators @@ -147,7 +149,7 @@ class Regressor(TabularEstimator): """Regressor class. It is used for tabular data regression. """ - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) self.config.objective = 'regression' # TODO: add choice to the set of estimators diff --git a/examples/benchmarking.py b/examples/benchmarking.py index dec5f93..750cbbb 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -63,6 +63,7 @@ def run_automation(self, task_list, time_limit=10*60): for task in task_list: try: self.evaluate(task, time_limit=time_limit) + self.export_results() except: print("task: {} didnt work".format(task)) From b35d9be2046ae807cfef52b081d74c3d7bc2ee21 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 18 Sep 2019 11:32:01 -0500 Subject: [PATCH 20/31] Diverse Ensembles --- autokaggle/config.py | 52 ++++++++++++++++++++++++++++++-------- autokaggle/estimators.py | 40 ++++++++++++++++++++++++----- autokaggle/preprocessor.py | 3 ++- examples/benchmarking.py | 9 +++---- 4 files changed, 80 insertions(+), 24 deletions(-) diff --git a/autokaggle/config.py b/autokaggle/config.py index e4d87aa..9b48062 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -7,12 +7,14 @@ ExtraTreesRegressor from sklearn.linear_model import Ridge from lightgbm import LGBMClassifier, LGBMRegressor +from catboost import CatBoostClassifier, Pool, CatBoostRegressor +import numpy as np class Config(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=5, cv_folds=3, - subsample_ratio=0.1, random_ensemble=False): + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -32,7 +34,9 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.resampling_strategy = 'auto' self.random_state = 1001 
self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest', 'adaboost'] - self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost'] + # self.classification_models = ['knn', 'lgbm', 'random_forest',] + self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost', 'catboost'] + self.diverse_ensemble = diverse_ensemble def update(self, options): for k, v in options.items(): @@ -49,9 +53,9 @@ def update(self, options): } svc_params = { - 'C': hp.lognormal('C', 0, 1), + 'C': hp.loguniform('C', np.log(1e-5), np.log(1e5)), 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 6)), + 'degree': hp.choice('degree', range(1, 7)), 'gamma': hp.uniform('gamma', 0.001, 10000), 'max_iter': 50000, } @@ -72,13 +76,23 @@ def update(self, options): 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), 'max_depth': hp.choice('max_depth', range(5, 10)), 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), + 'learning_rate': hp.loguniform('learning_rate', low=np.log(1e-5), high=np.log(1)), } adaboost_classifier_params = { 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-5), high=np.log(1)), +} + +catboost_classifier_params = { + 'iterations': hp.choice('catboost_iterations', [5, 10]), + 'depth': hp.choice('depth_catboost', range(4, 11)), + 'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), high=np.log(1)), + 'loss_function': hp.choice('loss_function_catboost', ['Logloss', 'CrossEntropy']), + 'verbose': True, + 'leaf_estimation_iterations': 10, + 'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3)) } extra_trees_regressor_params = { @@ -92,7 +106,7 @@ def update(self, options): ridge_params = { 'fit_intercept': True, 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), - 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) + 'alpha': hp.loguniform('alpha_ridge', np.log(1e-5), np.log(10)) } random_forest_regressor_params = { @@ -111,13 +125,21 @@ def update(self, options): 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), 'max_depth': hp.choice('max_depth', range(5, 10)), 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), + 'learning_rate': hp.loguniform('learning_rate', low=np.log(1e-5), high=np.log(1)), } adaboost_regressor_params = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-5), high=np.log(1)), +} + +catboost_regressor_params = { + 'iterations': 2, + 'depth': hp.choice('depth_catboost', range(4, 10)), + 'learning_rate': 1, + 'loss_function': 'RMSE', + 'verbose': True } @@ -141,7 +163,11 @@ def update(self, options): 'adaboost': { 'model': AdaBoostRegressor, 'param': adaboost_regressor_params - } + }, + 'catboost': { + 'model': CatBoostRegressor, + 'param': catboost_regressor_params + } } @@ -165,5 +191,9 @@ def update(self, options): 'adaboost': { 'model': AdaBoostClassifier, 'param': adaboost_classifier_params + }, + 'catboost': { + 'model': 
CatBoostClassifier, + 'param': catboost_classifier_params } } \ No newline at end of file diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 214940b..71034d4 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -1,3 +1,5 @@ +import collections + from sklearn.base import BaseEstimator from abc import abstractmethod import numpy as np @@ -103,18 +105,42 @@ def save_model(self): def get_skf(self, folds): pass + @staticmethod + def pick_diverse_estimators(trial_list, k): + groups = collections.defaultdict(list) + + for obj in trial_list: + groups[obj['space']['model']].append(obj) + estimator_list = [] + idx, j = 0, 0 + while idx < k: + for grp in groups.values(): + if j < len(grp): + model_params = grp[j]['space'] + est = model_params['model'](**model_params['param']) + estimator_list.append(est) + idx += 1 + j += 1 + return estimator_list + def setup_ensemble(self, trials): - best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) # Filter the unsuccessful hparam spaces i.e. 'loss' == 1 - best_trials = [t for t in best_trials if t['loss'] < 1] + best_trials = [t for t in trials.results if t['loss'] < 1] + best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) + self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) + if self.config.random_ensemble: np.random.shuffle(best_trials) - estimator_list = [] - for i in range(self.config.num_estimators_ensemble): - model_params = best_trials[i]['space'] - est = model_params['model'](**model_params['param']) - estimator_list.append(est) + + if self.config.diverse_ensemble: + estimator_list = self.pick_diverse_estimators(best_trials, self.config.num_estimators_ensemble) + else: + estimator_list = [] + for i in range(self.config.num_estimators_ensemble): + model_params = best_trials[i]['space'] + est = model_params['model'](**model_params['param']) + estimator_list.append(est) if self.config.ensemble_strategy == 'ranked_ensembling': best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 6bc0013..5f3423c 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -125,7 +125,7 @@ def __init__(self, raw_x, data_info, verbose=True): raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) self.X = raw_x - self.update_cat_cardinality() + # self.update_cat_cardinality() if self.verbose: print('DATA_INFO: {}'.format(self.data_info)) @@ -178,6 +178,7 @@ def refresh_col_types(self): self.n_cat = len(self.cat_col) def update_cat_cardinality(self): + # TODO: too slow make it faster if not self.cat_cardinality: self.cat_cardinality = {} for c in self.cat_col: diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 750cbbb..ad067d3 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -280,20 +280,19 @@ def get_box_plot(data, task_id, metric): if __name__ == "__main__": regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] - classification_task_list = [3021] - # classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, - # 9914, 3020, 3524, 3573, 3962] + # classification_task_list = [3021] + classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, + 9914, 3020, 3524, 3573, 3962] ak = 
BenchmarkingAutoKaggle(sess_name='test_perf') import time t1 = time.time() for _ in range(3): ak.run_automation(classification_task_list) - ak.export_results() t2 = time.time() print(t2-t1) # import time # t1 = time.time() - # ak.evaluate(31) + # ak.evaluate(3945) # t2 = time.time() # print(t2-t1) From 1731d77ba7caf783748ffc3a6e1ba3efb148adb1 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 18 Sep 2019 15:45:04 -0500 Subject: [PATCH 21/31] CV for stacking and proba stacking --- autokaggle/config.py | 5 +- autokaggle/ensemblers.py | 114 +++++++++++++++++++++++++++++++-------- autokaggle/estimators.py | 6 +-- examples/benchmarking.py | 14 ++--- 4 files changed, 104 insertions(+), 35 deletions(-) diff --git a/autokaggle/config.py b/autokaggle/config.py index 9b48062..b1bf225 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -13,8 +13,8 @@ class Config(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, - subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True): + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -37,6 +37,7 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True # self.classification_models = ['knn', 'lgbm', 'random_forest',] self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost', 'catboost'] self.diverse_ensemble = diverse_ensemble + self.stack_probabilities = stack_probabilities def update(self, options): for k, v in options.items(): diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 507fa6e..97b82d6 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -15,11 +15,27 @@ from lightgbm import LGBMClassifier, LGBMRegressor import collections from sklearn.model_selection import RandomizedSearchCV, cross_val_score +import hyperopt +from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK +from autokaggle.config import classification_hspace, regression_hspace + + +lgbm_classifier_params = { + 'n_estimators': hp.choice('n_estimators', [100, 150, 200]), +} + +_classification_hspace = { + 'lgbm': { + 'model': LGBMClassifier, + 'param': lgbm_classifier_params + }, +} class RankedEnsembler: - def __init__(self, estimator_list, ensemble_method='max_voting'): - self.ensemble_method = ensemble_method + def __init__(self, estimator_list, config): + self.config = config + self.ensemble_method = config.ensemble_method self.estimators = estimator_list def fit(self, X, y): @@ -44,44 +60,96 @@ def predict(self, X): class StackingEnsembler: - def __init__(self, estimator_list, objective): + stacking_estimator = None + + def __init__(self, estimator_list, config): + self.config = config self.estimator_list = estimator_list - self.objective = objective - if self.objective == 'regression': - self.stacking_estimator = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - elif self.objective == 'multiclass' or self.objective == 'binary': - self.stacking_estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) + self.objective = config.objective + if self.config.objective == 'regression': + 
self.hparams = hp.choice('regressor', [regression_hspace[m] for m in ['lgbm']]) + self.config.stack_probabilities = False + else: + self.hparams = hp.choice('classifier', [_classification_hspace[m] for m in ['lgbm']]) + + def get_model_predictions(self, X): + if self.config.stack_probabilities: + predictions = np.zeros((len(X), 1)) + for i, est in enumerate(self.estimator_list): + try: + new = est.predict_proba(X)[:, :-1] + predictions = np.hstack([predictions, new]) + except AttributeError: + new = np.reshape(est.predict(X), (-1, 1)) + predictions = np.hstack([predictions, new]) + predictions = predictions[:, 1:] + else: + predictions = np.zeros((len(X), len(self.estimator_list))) + for i, est in enumerate(self.estimator_list): + predictions[:, i] = est.predict(X) + return predictions def fit(self, X, y): for est in self.estimator_list: est.fit(X, y) - predictions = np.zeros((len(X), len(self.estimator_list))) - for i, est in enumerate(self.estimator_list): - predictions[:, i] = est.predict(X) + predictions = self.get_model_predictions(X) + self.stacking_estimator = self.search(predictions, y) self.stacking_estimator.fit(predictions, y) + def search(self, x, y): + score_metric, skf = self.get_skf(self.config.cv_folds) + + def objective_func(args): + clf = args['model'](**args['param']) + try: + eval_score = cross_val_score(clf, x, y, scoring=score_metric, cv=skf).mean() + except ValueError: + eval_score = 0 + if self.config.verbose: + print("Ensembling CV Score:", eval_score) + print("\n=================") + return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} + + trials = Trials() + best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, + max_evals=50) + + opt = space_eval(self.hparams, best) + best_estimator_ = opt['model'](**opt['param']) + if self.config.verbose: + print("The best hyperparameter setting found for stacking:") + print(opt) + return best_estimator_ + def predict(self, X): - predictions = np.zeros((len(X), len(self.estimator_list))) - for i, est in enumerate(self.estimator_list): - predictions[:, i] = est.predict(X) + predictions = self.get_model_predictions(X) return self.stacking_estimator.predict(predictions) + def get_skf(self, folds): + if self.config.objective == 'binary': + score_metric = 'roc_auc' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + elif self.config.objective == 'multiclass': + score_metric = 'f1_weighted' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + elif self.config.objective == 'regression': + score_metric = 'neg_mean_squared_error' + skf = KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + else: + ValueError("Invalid objective") + return score_metric, skf + class EnsembleSelection: indices_ = None weights_ = None - def __init__(self, estimator_list, objective, ensemble_size=25): + def __init__(self, estimator_list, config): self.estimator_list = estimator_list - self.objective = objective + self.config = config + self.objective = config.objective self.indices_, self.weights_ = [], [] - self.ensemble_size = min(len(estimator_list), ensemble_size) + self.ensemble_size = len(estimator_list) if self.objective == 'regression': self.score_metric = 'neg_mean_squared_error' self.skf = KFold(n_splits=3, shuffle=True, random_state=1001) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 71034d4..272ab5f 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ 
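get_model_predictions above builds the stacker's meta-feature matrix either from per-estimator class probabilities (dropping one redundant column each) or from plain predictions when an estimator has no predict_proba. A hedged standalone sketch of that logic (the sklearn models are chosen only for illustration):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC  # has no predict_proba

X = np.random.rand(50, 4)
y = np.random.randint(0, 2, 50)
estimators = [LogisticRegression().fit(X, y), LinearSVC().fit(X, y)]

cols = []
for est in estimators:
    try:
        cols.append(est.predict_proba(X)[:, :-1])   # (n, n_classes - 1)
    except AttributeError:
        cols.append(est.predict(X).reshape(-1, 1))  # (n, 1) fallback
meta_features = np.hstack(cols)
print(meta_features.shape)  # (50, 2)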
-143,11 +143,11 @@ def setup_ensemble(self, trials): estimator_list.append(est) if self.config.ensemble_strategy == 'ranked_ensembling': - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + best_estimator_ = RankedEnsembler(estimator_list, config=self.config) elif self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, objective=self.config.objective) + best_estimator_ = StackingEnsembler(estimator_list, config=self.config) else: - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + best_estimator_ = RankedEnsembler(estimator_list, config=self.config) return best_estimator_ diff --git a/examples/benchmarking.py b/examples/benchmarking.py index ad067d3..50dab23 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -286,13 +286,13 @@ def get_box_plot(data, task_id, metric): ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time - t1 = time.time() - for _ in range(3): - ak.run_automation(classification_task_list) - t2 = time.time() - print(t2-t1) - # import time # t1 = time.time() - # ak.evaluate(3945) + # for _ in range(3): + # ak.run_automation(classification_task_list) # t2 = time.time() # print(t2-t1) + import time + t1 = time.time() + ak.evaluate(31) + t2 = time.time() + print(t2-t1) From e614bfde8c43a16a303df691b9bdd4cf0ac12f4d Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 19 Sep 2019 11:13:12 -0500 Subject: [PATCH 22/31] hparam update --- autokaggle/config.py | 75 +++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/autokaggle/config.py b/autokaggle/config.py index b1bf225..985938a 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -46,26 +46,26 @@ def update(self, options): knn_classifier_params = { - 'n_neighbors': hp.choice('n_neighbors', range(2, 20)), - 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), - 'leaf_size': hp.choice('leaf_size', range(5, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", "chebyshev", "minkowski"]), - 'p': hp.choice('p', range(1, 4)), + 'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]), + 'weights': hp.choice('weight_knn', ['uniform', 'distance']), + 'metric': hp.choice('metric_knn', ["euclidean", "manhattan", "chebyshev", "minkowski"]), + 'p': hp.choice('p_knn', range(1, 3)), } svc_params = { - 'C': hp.loguniform('C', np.log(1e-5), np.log(1e5)), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 7)), - 'gamma': hp.uniform('gamma', 0.001, 10000), + 'C': hp.loguniform('C_svm', np.log(0.03125), np.log(32768)), + 'kernel': hp.choice('kernel_svm', ['rbf', 'poly', 'sigmoid']), + 'degree': hp.choice('degree_svm', range(2, 6)), + 'gamma': hp.loguniform('gamma_svm', np.log(3e-5), np.log(8)), 'max_iter': 50000, } random_forest_classifier_params = { - 'criterion': hp.choice('criterion', ['entropy', 'gini']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) + 'criterion': hp.choice('criterion_rf', ['entropy', 'gini']), + 'max_features': hp.uniform('max_features_rf', 0, 1.0), + 'n_estimators': hp.choice('n_estimators_rf', [100, 50]), + 'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 20)), + 'min_samples_split': hp.choice('min_samples_split_rf', range(2, 20)), } 
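Besides suffixing every hyperopt label with its model name, these updates keep the learning-rate priors on bounded hp.loguniform ranges rather than the earlier hp.lognormal('learning_rate', 0, 1); the difference matters because exp(N(0, 1)) puts roughly half its mass above 1.0. A small sampling sketch (labels and sample counts are illustrative):

import numpy as np
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

rng = np.random.RandomState(0)
old = [sample(hp.lognormal('lr_old', 0, 1), rng=rng) for _ in range(1000)]
new = [sample(hp.loguniform('lr_new', np.log(1e-2), np.log(2)), rng=rng) for _ in range(1000)]
print(sum(v > 1 for v in old) / 1000)          # ~0.5 of draws exceed 1.0
print(round(min(new), 4), round(max(new), 4))  # stays inside [0.01, 2.0]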
lgbm_classifier_params = { @@ -73,21 +73,21 @@ def update(self, options): 'min_split_gain': 0.1, 'subsample': 0.8, 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate', low=np.log(1e-5), high=np.log(1)), + 'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), + 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), + 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2), high=np.log(2)), } adaboost_classifier_params = { 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-5), high=np.log(1)), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), } catboost_classifier_params = { - 'iterations': hp.choice('catboost_iterations', [5, 10]), + 'iterations': hp.choice('iterations_catboost', [5, 10]), 'depth': hp.choice('depth_catboost', range(4, 11)), 'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), high=np.log(1)), 'loss_function': hp.choice('loss_function_catboost', ['Logloss', 'CrossEntropy']), @@ -97,11 +97,13 @@ def update(self, options): } extra_trees_regressor_params = { - 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 200)), + 'n_estimators': hp.choice('n_estimators_extra_trees', [50, 100, 200]), 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), - 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), - 'min_impurity_decrease': 0.0 + 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 20)), + 'min_samples_split': hp.choice('min_samples_split_extra_trees', range(2, 20)), + 'min_impurity_decrease': 0.0, + 'bootstrap': hp.choice('bootstrap_extra_trees', [True, False]), } ridge_params = { @@ -111,10 +113,12 @@ def update(self, options): } random_forest_regressor_params = { - 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) + 'criterion': hp.choice('criterion_rf', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features_rf', 0.1, 1.0), + 'n_estimators': hp.choice('n_estimators_rf', [50, 100, 200]), + 'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 10)), + 'min_samples_split': hp.choice('min_samples_split_rf', range(2, 10)), + 'bootstrap': hp.choice('bootstrap_rf', [True, False]), } lgbm_regressor_params = { @@ -122,17 +126,18 @@ def update(self, options): 'min_split_gain': 0.1, 'subsample': 0.8, 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': 
hp.loguniform('learning_rate', low=np.log(1e-5), high=np.log(1)), + 'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), + 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), + 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), high=np.log(1)), } adaboost_regressor_params = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-5), high=np.log(1)), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), + # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)), } catboost_regressor_params = { From 951e1d10399580f6f9e0a432c82f791c920468b2 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 19 Sep 2019 12:14:12 -0500 Subject: [PATCH 23/31] Add blind dataset in stacking --- autokaggle/ensemblers.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 97b82d6..68cafa7 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -7,7 +7,7 @@ import json from statistics import mode -from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import RandomizedSearchCV, train_test_split from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error from joblib import dump, load @@ -90,11 +90,12 @@ def get_model_predictions(self, X): return predictions def fit(self, X, y): + x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2) for est in self.estimator_list: - est.fit(X, y) - predictions = self.get_model_predictions(X) - self.stacking_estimator = self.search(predictions, y) - self.stacking_estimator.fit(predictions, y) + est.fit(x_train, y_train) + predictions = self.get_model_predictions(x_val) + self.stacking_estimator = self.search(predictions, y_val) + self.stacking_estimator.fit(predictions, y_val) def search(self, x, y): score_metric, skf = self.get_skf(self.config.cv_folds) From 5aed1271bdff01214b030db68a681fc872b5f38b Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 19 Sep 2019 20:45:45 -0500 Subject: [PATCH 24/31] Refactor with AutoPipe --- autokaggle/auto_ml.py | 252 ++++++++++++++++++++++++++++----- autokaggle/config.py | 11 +- autokaggle/estimators.py | 185 ------------------------ autokaggle/preprocessor.py | 37 +---- examples/tabular_regression.py | 2 +- 5 files changed, 231 insertions(+), 256 deletions(-) delete mode 100644 autokaggle/estimators.py diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 9fc82c2..4707212 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -1,4 +1,4 @@ -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, is_classifier from abc import abstractmethod import numpy as np import os @@ -8,15 +8,22 @@ from joblib import dump, load from autokaggle.preprocessor import TabularPreprocessor -from autokaggle.estimators import * from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from lightgbm import LGBMClassifier, LGBMRegressor +from autokaggle.config import Config, classification_hspace, regression_hspace +from 
sklearn.model_selection import StratifiedKFold, KFold +import hyperopt +from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL +from sklearn.model_selection import cross_val_score +from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler +from imblearn.over_sampling import SMOTE, SMOTENC +import collections # TODO: Further clean the design of this file class AutoKaggle(BaseEstimator): - objective = None - model = None - preprocessor = None + pipeline = None + hparams = None def __init__(self, config=None, **kwargs): """ @@ -25,11 +32,8 @@ def __init__(self, config=None, **kwargs): self.is_trained = False self.config = config if config else Config() self.config.update(kwargs) - self.config.objective = self.objective if not self.config.path: self.config.path = rand_temp_folder_generator() - # abs_cwd = os.path.split(os.path.abspath(__file__))[0] - self.preprocessor = TabularPreprocessor(self.config) def fit(self, x, y, time_limit=None, data_info=None): """ @@ -48,34 +52,43 @@ def fit(self, x, y, time_limit=None, data_info=None): NOT be available for re-training. """ self.config.time_limit = time_limit if time_limit else 24 * 60 * 60 + + # Extract or read data info + self.config.data_info = data_info if data_info is not None else self.extract_data_info(x) + + if self.config.verbose: + print('DATA_INFO: {}'.format(self.config.data_info)) + print('#TIME features: {}'.format(sum(self.config.data_info == 'TIME'))) + print('#NUM features: {}'.format(sum(self.config.data_info == 'NUM'))) + print('#CAT features: {}'.format(sum(self.config.data_info == 'CAT'))) if x.shape[1] == 0: raise ValueError("No feature exist!") - while x.shape[0] < 60: - x = np.concatenate([x, x], axis=0) - y = np.concatenate([y, y], axis=0) - - # Fit Model and preprocessor - self.preprocessor.fit(x, y, data_info) - x = self.preprocessor.transform(x) - self.model.fit(x, y) - self.is_trained = True + x, y = self.resample(x, y) - if self.config.verbose: - print("The whole available data is: ") - print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) + if self.config.objective == 'classification': + n_classes = len(set(y)) + self.config.objective = 'binary' if n_classes == 2 else 'multiclass' + + # self.pipeline = AutoPipe(LGBMClassifier, {}, {}, self.config) + prep_space = {'prep': hp.choice('data_source', ['a', 'b'])} + self.pipeline = self.get_best_pipeline(self.search(x, y, prep_space, self.hparams)) + self.pipeline.fit(x, y) + self.is_trained = True - def predict(self, x_test, predict_proba=False): + def predict(self, x_test): """ This function should provide predictions of labels on (test) data. The function predict eventually can return probabilities or continuous values. 
""" - x_test = self.preprocessor.transform(x_test) - if predict_proba: - y = self.model.predict_proba(x_test, ) - else: - y = self.model.predict(x_test, ) + y = self.pipeline.predict(x_test) + if y is None: + raise ValueError("Tabular predictor does not exist") + return y + + def predict_proba(self, x_test): + y = self.pipeline.predict_proba(x_test) if y is None: raise ValueError("Tabular predictor does not exist") return y @@ -94,21 +107,192 @@ def evaluate(self, x_test, y_test): return results def final_fit(self, x_train, y_train): - x_train = self.preprocessor.transform(x_train) - self.model.fit(x_train, y_train) + self.pipeline.fit(x_train, y_train) + def resample(self, x, y): + if self.config.balance_class_dist: + x, y = SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(x, y) + while x.shape[0] < 60: + x = np.concatenate([x, x], axis=0) + y = np.concatenate([y, y], axis=0) + return x, y -class AutoKaggleClassifier(AutoKaggle): - objective = 'classification' + def subsample(self, x, y, sample_percent): + # TODO: Add way to balance the subsample + # Set small sample for hyper-param search + if x.shape[0] > 600: + grid_train_percentage = max(600.0 / x.shape[0], sample_percent) + else: + grid_train_percentage = 1 + grid_n = int(x.shape[0] * grid_train_percentage) + idx = random.sample(list(range(x.shape[0])), grid_n) + grid_train_x, grid_train_y = x[idx, :], y[idx] + return grid_train_x, grid_train_y + + def search(self, x, y, prep_space, model_space): + grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) + score_metric, skf = self.get_skf(self.config.cv_folds) + + def objective_func(params): + model_class = params['estimator']['model'] + m_params = params['estimator']['param'] + p_params = params['prep'] + pipeline = AutoPipe(model_class=model_class, m_params=m_params, p_params=p_params, config=self.config) + try: + eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + status = STATUS_OK + except ValueError: + eval_score = float('-inf') + status = STATUS_FAIL + if self.config.verbose: + print("CV Score:", eval_score) + print("\n=================") + loss = 1 - eval_score if status == STATUS_OK else float('inf') + return {'loss': loss, 'status': status, 'model_class': model_class, 'm_params': m_params, + 'p_params': p_params} + + trials = Trials() + search_space = {'prep': prep_space, 'estimator': model_space} + _ = fmin(objective_func, search_space, algo=hyperopt.rand.suggest, trials=trials, + max_evals=self.config.search_iter) + return trials + + def get_best_pipeline(self, trials): + if self.config.use_ensembling: + best_pipeline = self.setup_ensemble(trials) + else: + opt = trials.best_trial['result'] + best_pipeline = AutoPipe(opt['model_class'], opt['m_params'], opt['p_params'], self.config) + if self.config.verbose: + print("The best hyperparameter setting found:") + print(opt) + return best_pipeline + + @abstractmethod + def get_skf(self, folds): + pass + + def pick_diverse_estimators(self, trial_list, k): + groups = collections.defaultdict(list) + for obj in trial_list: + groups[obj['model_class']].append(obj) + estimator_list = [] + idx, j = 0, 0 + while idx < k: + for grp in groups.values(): + if j < len(grp): + est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'], grp[j]['p_params'], self.config) + estimator_list.append(est) + idx += 1 + j += 1 + return estimator_list + + def setup_ensemble(self, trials): + # Filter the unsuccessful hparam spaces i.e. 
'loss' == float('inf') + best_trials = [t for t in trials.results if t['loss'] != float('inf')] + best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) + + self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) + + if self.config.random_ensemble: + np.random.shuffle(best_trials) + + if self.config.diverse_ensemble: + estimator_list = self.pick_diverse_estimators(best_trials, self.config.num_estimators_ensemble) + else: + estimator_list = [] + for i in range(self.config.num_estimators_ensemble): + est = AutoPipe(best_trials[i]['model_class'], best_trials[i]['m_params'], best_trials[i]['p_params'], + self.config) + estimator_list.append(est) + + if self.config.ensemble_strategy == 'stacking': + best_estimator_ = StackingEnsembler(estimator_list, config=self.config) + else: + best_estimator_ = RankedEnsembler(estimator_list, config=self.config) + return best_estimator_ + + @staticmethod + def extract_data_info(raw_x): + """ + This function extracts the data info automatically based on the type of each feature in raw_x. + + Args: + raw_x: a numpy.ndarray instance containing the training data. + """ + data_info = [] + row_num, col_num = raw_x.shape + for col_idx in range(col_num): + try: + raw_x[:, col_idx].astype(np.float) + data_info.append('NUM') + except: + data_info.append('CAT') + return np.array(data_info) + + +class AutoKaggleClassifier(AutoKaggle): def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) - self.model = Classifier(self.config) + self.config.objective = 'classification' + self.hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) + def get_skf(self, folds): + if self.config.objective == 'binary': + score_metric = 'roc_auc' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + else: + score_metric = 'f1_weighted' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + return score_metric, skf -class AutoKaggleRegressor(AutoKaggle): - objective = 'regression' +class AutoKaggleRegressor(AutoKaggle): def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) - self.model = Regressor(self.config) + self.config.objective = 'regression' + self.hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) + + def get_skf(self, folds): + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + + +class AutoPipe(BaseEstimator): + prep = None + model = None + config = None + m_params = None + p_params = None + model_class = None + + def __init__(self, model_class, m_params, p_params, config): + self.config = config + self.m_params = m_params + self.p_params = p_params + self.model_class = model_class + self._estimator_type = 'classifier' if is_classifier(model_class) else 'regressor' + + def fit(self, x, y): + self.prep = TabularPreprocessor(self.config) + self.model = self.model_class(**self.m_params) + x = self.prep.fit_transform(x, y) + self.model.fit(x, y) + + def predict(self, x): + x = self.prep.transform(x) + return self.model.predict(x) + + def predict_proba(self, x): + x = self.prep.transform(x) + try: + return self.model.predict_proba(x) + except AttributeError: + return self.model.predict(x) + + def decision_function(self, x): + x = self.prep.transform(x) + try: + return self.model.decision_function(x) + except AttributeError: + raise AttributeError diff 
--git a/autokaggle/config.py b/autokaggle/config.py index 985938a..d9be2ef 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -11,10 +11,11 @@ import numpy as np -class Config(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, - subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False): +class Config: + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=5, + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, + data_info=None, balance_class_dist=False): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -38,6 +39,8 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost', 'catboost'] self.diverse_ensemble = diverse_ensemble self.stack_probabilities = stack_probabilities + self.data_info = data_info + self.balance_class_dist = balance_class_dist def update(self, options): for k, v in options.items(): diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py deleted file mode 100644 index 272ab5f..0000000 --- a/autokaggle/estimators.py +++ /dev/null @@ -1,185 +0,0 @@ -import collections - -from sklearn.base import BaseEstimator -from abc import abstractmethod -import numpy as np -import os -import random -import json - -from sklearn.model_selection import RandomizedSearchCV, cross_val_score -from sklearn.model_selection import StratifiedKFold, KFold - -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer -from joblib import dump, load - -from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json -from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler -from autokaggle.config import Config, classification_hspace, regression_hspace -import hyperopt -from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK -from imblearn.over_sampling import SMOTE, SMOTENC - - -class TabularEstimator(BaseEstimator): - def __init__(self, config=None, **kwargs): - """ - Initialization function for tabular supervised learner. 
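[Editor's note, not part of the patch] As a usage note for the Config changes above: AutoKaggle's constructor forwards extra keyword arguments to Config.update(), so callers can override the defaults shown in the diff. A sketch under that assumption (option names are taken from the Config constructor signature; what update() does with each key is assumed to be attribute assignment):

    from autokaggle import AutoKaggleClassifier
    from autokaggle.config import Config

    # Either build a Config explicitly ...
    cfg = Config(use_ensembling=True, ensemble_strategy='stacking', cv_folds=5)
    clf = AutoKaggleClassifier(config=cfg)

    # ... or pass keyword overrides, which reach Config through update().
    clf = AutoKaggleClassifier(num_estimators_ensemble=10, subsample_ratio=0.2)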
- """ - self.config = config if config else Config() - self.best_estimator_ = None - self.hparams = None - - def fit(self, x, y): - if self.config.objective == 'classification': - n_classes = len(set(y)) - self.config.objective = 'binary' if n_classes == 2 else 'multiclass' - # x, y = self.resample(x, y) - self.best_estimator_, _ = self.search(x, y) - self.best_estimator_.fit(x, y) - self.save_model() - - def predict(self, x, y=None): - y_pred = self.best_estimator_.predict(x, ) - return y_pred - - def predict_proba(self, x, y=None): - try: - y_pred = self.best_estimator_.predict_proba(x, ) - except: - y_pred = self.best_estimator_.predict(x, ) - return y_pred - - def resample(self, X, y): - return SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(X, y) - - @staticmethod - def subsample(x, y, sample_percent): - # TODO: Add way to balance the subsample - # Set small sample for hyper-param search - if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], sample_percent) - else: - grid_train_percentage = 1 - grid_n = int(x.shape[0] * grid_train_percentage) - idx = random.sample(list(range(x.shape[0])), grid_n) - grid_train_x, grid_train_y = x[idx, :], y[idx] - return grid_train_x, grid_train_y - - def search(self, x, y): - grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) - score_metric, skf = self.get_skf(self.config.cv_folds) - - def objective_func(params): - args = params['estimator'] - clf = args['model'](**args['param']) - try: - eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() - except ValueError: - eval_score = 0 - if self.config.verbose: - print("CV Score:", eval_score) - print("\n=================") - return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} - - trials = Trials() - search_space = {'source': hp.choice('data_source', ['a', 'b']), 'estimator': self.hparams} - best = fmin(objective_func, search_space, algo=hyperopt.rand.suggest, trials=trials, - max_evals=self.config.search_iter) - - if self.config.use_ensembling: - best_estimator_ = self.setup_ensemble(trials) - else: - opt = space_eval(self.hparams, best) - best_estimator_ = opt['model'](**opt['param']) - if self.config.verbose: - print("The best hyperparameter setting found:") - print(opt) - return best_estimator_, trials - - @abstractmethod - def save_model(self): - pass - - @abstractmethod - def get_skf(self, folds): - pass - - @staticmethod - def pick_diverse_estimators(trial_list, k): - groups = collections.defaultdict(list) - - for obj in trial_list: - groups[obj['space']['model']].append(obj) - estimator_list = [] - idx, j = 0, 0 - while idx < k: - for grp in groups.values(): - if j < len(grp): - model_params = grp[j]['space'] - est = model_params['model'](**model_params['param']) - estimator_list.append(est) - idx += 1 - j += 1 - return estimator_list - - def setup_ensemble(self, trials): - # Filter the unsuccessful hparam spaces i.e. 
'loss' == 1 - best_trials = [t for t in trials.results if t['loss'] < 1] - best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) - - self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) - - if self.config.random_ensemble: - np.random.shuffle(best_trials) - - if self.config.diverse_ensemble: - estimator_list = self.pick_diverse_estimators(best_trials, self.config.num_estimators_ensemble) - else: - estimator_list = [] - for i in range(self.config.num_estimators_ensemble): - model_params = best_trials[i]['space'] - est = model_params['model'](**model_params['param']) - estimator_list.append(est) - - if self.config.ensemble_strategy == 'ranked_ensembling': - best_estimator_ = RankedEnsembler(estimator_list, config=self.config) - elif self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, config=self.config) - else: - best_estimator_ = RankedEnsembler(estimator_list, config=self.config) - return best_estimator_ - - -class Classifier(TabularEstimator): - """Classifier class. - It is used for tabular data classification. - """ - def __init__(self, config=None, **kwargs): - super().__init__(config, **kwargs) - self.config.objective = 'classification' - # TODO: add choice to the set of estimators - self.hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) - - def get_skf(self, folds): - if self.config.objective == 'binary': - score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) - else: - score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) - return score_metric, skf - - -class Regressor(TabularEstimator): - """Regressor class. - It is used for tabular data regression. - """ - def __init__(self, config=None, **kwargs): - super().__init__(config, **kwargs) - self.config.objective = 'regression' - # TODO: add choice to the set of estimators - self.hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) - - def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 5f3423c..24781f2 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -16,7 +16,7 @@ LEVEL_HIGH = 32 -class TabularPreprocessor: +class TabularPreprocessor(TransformerMixin): def __init__(self, config): """ Initialization function for tabular preprocessor. @@ -25,7 +25,7 @@ def __init__(self, config): self.pipeline = None self.config = config - def fit(self, raw_x, y, data_info): + def fit(self, raw_x, y): """ This function should train the model parameters. @@ -38,16 +38,13 @@ def fit(self, raw_x, y, data_info): 'TIME' for temporal feature, 'NUM' for other numerical feature, and 'CAT' for categorical feature. 
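[Editor's note, not part of the patch] To make the data_info convention described above concrete, here is a small standalone sketch; it mirrors the float-cast heuristic used by extract_data_info() earlier in this patch, under an invented name and with an invented sample array.

    import numpy as np

    def guess_data_info(raw_x):
        # Columns that cast to float are tagged 'NUM'; the rest are 'CAT'.
        # Temporal columns cannot be guessed this way and must be passed as 'TIME' explicitly.
        info = []
        for col_idx in range(raw_x.shape[1]):
            try:
                raw_x[:, col_idx].astype(np.float64)
                info.append('NUM')
            except ValueError:
                info.append('CAT')
        return np.array(info)

    x = np.array([[1.5, 'red'], [2.0, 'blue']], dtype=object)
    print(guess_data_info(x))   # -> ['NUM' 'CAT']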
""" - # Extract or read data info - self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - - data = TabularData(raw_x, self.data_info, self.config.verbose) + data = TabularData(raw_x, self.config.data_info, self.config.verbose) self.pipeline = Pipeline([ ('imputer', Imputation(selected_type='ALL', operation='upd')), # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), # ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), - ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), + ('target_encoder', TargetEncoder(selected_type='CAT', operation='upd')), # ('count_encoder', CatCount(selected_type='CAT', operation='upd')), # ('one_hot_encoder', OneHot(selected_type='CAT', operation='upd')), # ('label_encoder', LabelEncode(selected_type='CAT', operation='upd')), @@ -78,28 +75,10 @@ def transform(self, raw_x): NOT be available for re-training. """ # Get Meta-Feature - data = TabularData(raw_x, self.data_info, self.config.verbose) + data = TabularData(raw_x, self.config.data_info, self.config.verbose) a = self.pipeline.transform(data).X return a.values - @staticmethod - def extract_data_info(raw_x): - """ - This function extracts the data info automatically based on the type of each feature in raw_x. - - Args: - raw_x: a numpy.ndarray instance containing the training data. - """ - data_info = [] - row_num, col_num = raw_x.shape - for col_idx in range(col_num): - try: - raw_x[:, col_idx].astype(np.float) - data_info.append('NUM') - except: - data_info.append('CAT') - return np.array(data_info) - class TabularData: cat_col = None @@ -127,12 +106,6 @@ def __init__(self, raw_x, data_info, verbose=True): self.X = raw_x # self.update_cat_cardinality() - if self.verbose: - print('DATA_INFO: {}'.format(self.data_info)) - print('#TIME features: {}'.format(self.n_time)) - print('#NUM features: {}'.format(self.n_num)) - print('#CAT features: {}'.format(self.n_cat)) - def update_type(self, columns, new_type): for c in columns: self.data_info[c] = new_type diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 885160c..ec5af7a 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggleRegressor('regression') + clf = AutoKaggleRegressor() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 4749d5cd03cee5fa1404b5b953e43eecb4c5fb4a Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 19 Sep 2019 21:53:51 -0500 Subject: [PATCH 25/31] Fix higher order primitives --- autokaggle/__init__.py | 1 - autokaggle/preprocessor.py | 130 +++++++++++++++++++++++-------------- 2 files changed, 83 insertions(+), 48 deletions(-) diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index 1a975c9..727b31e 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,3 +1,2 @@ from autokaggle.auto_ml import AutoKaggleClassifier, AutoKaggleRegressor -from autokaggle.estimators import * from autokaggle.ensemblers import * diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 24781f2..ebb3f43 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -12,17 +12,17 @@ from abc import abstractmethod import collections from lightgbm import LGBMClassifier, LGBMRegressor -from autokaggle.estimators import Config LEVEL_HIGH = 
32 class TabularPreprocessor(TransformerMixin): + pipeline = None + data_info = None + def __init__(self, config): """ Initialization function for tabular preprocessor. """ - self.data_info = None - self.pipeline = None self.config = config def fit(self, raw_x, y): @@ -41,23 +41,23 @@ def fit(self, raw_x, y): data = TabularData(raw_x, self.config.data_info, self.config.verbose) self.pipeline = Pipeline([ - ('imputer', Imputation(selected_type='ALL', operation='upd')), - # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), - # ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), - ('target_encoder', TargetEncoder(selected_type='CAT', operation='upd')), - # ('count_encoder', CatCount(selected_type='CAT', operation='upd')), - # ('one_hot_encoder', OneHot(selected_type='CAT', operation='upd')), - # ('label_encoder', LabelEncode(selected_type='CAT', operation='upd')), - ('scaler', TabScaler(selected_type='NUM', operation='upd')), - # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), - # ('log_transform', LogTransform(selected_type='NUM', operation='upd')), - # ('binning', Binning(selected_type='NUM', operation='upd')), - # ('pca', TabPCA(selected_type='NUM', operation='add')), - # ('time_diff', TimeDiff(selected_type='TIME', operation='add')), - # ('time_offset', TimeOffset(selected_type='TIME', operation='upd')), - ('filter', FilterConstant(selected_type='ALL', operation='del')), - # ('pearson_corr', FeatureFilter(selected_type='ALL', operation='del')), - # ('lgbm_feat_selection', FeatureImportance(selected_type='ALL', operation='del')), + ('imputer', Imputation(operation='upd', selected_type='ALL')), + # ('cat_num_encoder', CatNumEncoder(operation='add', selected_type1='CAT', selected_type2='NUM')), + # ('cat_cat_encoder', CatCatEncoder(operation='add', selected_type1='CAT', selected_type2='CAT')), + ('target_encoder', TargetEncoder(operation='upd', selected_type='CAT')), + # ('count_encoder', CatCount(operation='upd', selected_type='CAT')), + # ('one_hot_encoder', OneHot(operation='upd', selected_type='CAT')), + # ('label_encoder', LabelEncode(operation='upd', selected_type='CAT')), + ('scaler', TabScaler(operation='upd', selected_type='NUM')), + # ('boxcox', BoxCox(operation='upd', selected_type='NUM')), + # ('log_transform', LogTransform(operation='upd', selected_type='NUM')), + # ('binning', Binning(operation='upd', selected_type='NUM')), + # ('pca', TabPCA(operation='add', selected_type='NUM')), + # ('time_diff', TimeDiff(operation='add', selected_type='TIME')), + # ('time_offset', TimeOffset(operation='upd', selected_type='TIME')), + ('filter', FilterConstant(operation='del', selected_type='ALL')), + # ('pearson_corr', FeatureFilter(operation='del', selected_type='ALL')), + # ('lgbm_feat_selection', FeatureImportance(operation='del', selected_type='ALL')), ]) self.pipeline.fit(data, y) @@ -179,7 +179,7 @@ class Primitive(BaseEstimator, TransformerMixin): supported_ops = ('add', 'upd', 'del') name_key = '' - def __init__(self, selected_type=None, operation='upd', **kwargs): + def __init__(self, operation='upd', selected_type=None, **kwargs): self.selected_type = selected_type self.operation = operation self.init_vars(**kwargs) @@ -211,10 +211,48 @@ def _transform(self, data, y=None): pass -class PrimitiveHigherOrder(Primitive): - def __init__(self, operation='upd', selected_type=None, selected_type2=None, **kwargs): - super().__init__(selected_type, operation, **kwargs) +class PrimitiveHigherOrder: + selected_1 = None + 
selected_2 = None + drop_columns = None + options = None + supported_ops = ('add', 'upd', 'del') + name_key = '' + + def __init__(self, operation='upd', selected_type1=None, selected_type2=None, **kwargs): + self.operation = operation + self.selected_type1 = selected_type1 self.selected_type2 = selected_type2 + self.init_vars(**kwargs) + self.name_key = self.__class__.__name__ + + def init_vars(self, **kwargs): + self.options = kwargs + + def fit(self, data, y=None): + self.selected_1 = data.select_columns(self.selected_type1) + self.selected_2 = data.select_columns(self.selected_type2) + + if self.operation not in self.supported_ops: + print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) + self.selected_1 = None + self.selected_2 = None + if not self.selected_1 or not self.selected_2: + return self + return self._fit(data, y) + + def transform(self, data, y=None): + if not self.selected_1 or not self.selected_2: + return data + return self._transform(data, y) + + @abstractmethod + def _fit(self, data, y=None): + pass + + @abstractmethod + def _transform(self, data, y=None): + pass class TabScaler(Primitive): @@ -345,7 +383,9 @@ def _transform(self, data, y=None): class CatCatEncoder(PrimitiveHigherOrder): - supported_ops = ('add', 'upd') + supported_ops = ('add', ) + cat_cat_map = None + @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): if strategy == 'count': @@ -359,28 +399,23 @@ def cat_cat_count(df, col1, col2, strategy='count'): def _fit(self, data, y=None): self.cat_cat_map = {} self.strategy = self.options.get('strategy', 'count') - for col1, col2 in itertools.combinations(self.selected, 2): + self.selected_1 = list(set(self.selected_1 + self.selected_2)) + for col1, col2 in itertools.combinations(self.selected_1, 2): self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() - for col1, col2 in itertools.combinations(self.selected, 2): + for col1, col2 in itertools.combinations(self.selected_1, 2): if col1 + '_cross_' + col2 in self.cat_cat_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) - # TODO: decide new_type - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) return data class CatNumEncoder(PrimitiveHigherOrder): - supported_ops = ('add', 'upd') - - def __init__(self, selected_type=None, selected_num=[], operation='add', strategy='mean'): - super().__init__(selected_type, operation) - self.selected_num = selected_num - self.strategy = strategy - self.cat_num_map = {} + supported_ops = ('add', ) + cat_num_map = None @staticmethod def cat_num_interaction(df, col1, col2, method='mean'): @@ -400,23 +435,24 @@ def cat_num_interaction(df, col1, col2, method='mean'): def _fit(self, data, y=None): self.cat_num_map = {} self.strategy = self.options.get('strategy', 'mean') - for col1 in self.selected: - for col2 in self.selected_num: + for col1 in self.selected_1: + for col2 in self.selected_2: self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() - for col1 in self.selected: - for col2 in self.selected_num: + for col1 in self.selected_1: + for col2 in self.selected_2: if col1 + '_cross_' + col2 in self.cat_num_map: x_tr[col1 + '_cross_' + 
col2] = data.X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) return data class CatBinEncoder(PrimitiveHigherOrder): - supported_ops = ('add', 'upd') + supported_ops = ('add', ) + cat_bin_map = None @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): @@ -431,18 +467,18 @@ def cat_bin_interaction(df, col1, col2, strategy='percent_true'): def _fit(self, data, y=None): self.cat_bin_map = {} self.strategy = self.options.get('strategy', 'percent_true') - for col1 in self.selected: - for col2 in self.selected_bin: + for col1 in self.selected_1: + for col2 in self.selected_2: self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() - for col1 in self.selected: - for col2 in self.selected_bin: + for col1 in self.selected_1: + for col2 in self.selected_2: if col1 + '_cross_' + col2 in self.cat_bin_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) return data From f4df7fd24f7ea1abe37d4288743a48d5531efdd2 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Fri, 20 Sep 2019 01:22:25 -0500 Subject: [PATCH 26/31] Add params to preprocessor --- autokaggle/auto_ml.py | 2 +- autokaggle/config.py | 4 +- autokaggle/preprocessor.py | 153 +++++++++++++++++++++++++++++++------ 3 files changed, 131 insertions(+), 28 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 4707212..c7c021a 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -274,7 +274,7 @@ def __init__(self, model_class, m_params, p_params, config): self._estimator_type = 'classifier' if is_classifier(model_class) else 'regressor' def fit(self, x, y): - self.prep = TabularPreprocessor(self.config) + self.prep = TabularPreprocessor(self.config, self.p_params) self.model = self.model_class(**self.m_params) x = self.prep.fit_transform(x, y) self.model.fit(x, y) diff --git a/autokaggle/config.py b/autokaggle/config.py index d9be2ef..4e780ff 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -12,8 +12,8 @@ class Config: - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=5, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, data_info=None, balance_class_dist=False): self.verbose = verbose diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index ebb3f43..258374a 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -18,12 +18,15 @@ class TabularPreprocessor(TransformerMixin): pipeline = None data_info = None + params = None + config = None - def __init__(self, config): + def __init__(self, config, params): """ Initialization function for tabular preprocessor. 
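[Editor's note, not part of the patch] A plain-pandas illustration of the categorical-numerical interaction that CatNumEncoder builds above: each categorical column is mapped to an aggregate (for example the mean) of a numerical column, yielding a new '<cat>_cross_<num>' feature. The column names below are invented.

    import pandas as pd

    df = pd.DataFrame({'city': ['a', 'a', 'b', 'b'], 'price': [10, 20, 30, 50]})
    cat_num_map = df.groupby('city')['price'].mean()      # what cat_num_interaction computes
    df['city_cross_price'] = df['city'].map(cat_num_map)  # what _transform applies
    print(df)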
""" self.config = config + self.params = params def fit(self, raw_x, y): """ @@ -40,25 +43,15 @@ def fit(self, raw_x, y): """ data = TabularData(raw_x, self.config.data_info, self.config.verbose) - self.pipeline = Pipeline([ - ('imputer', Imputation(operation='upd', selected_type='ALL')), - # ('cat_num_encoder', CatNumEncoder(operation='add', selected_type1='CAT', selected_type2='NUM')), - # ('cat_cat_encoder', CatCatEncoder(operation='add', selected_type1='CAT', selected_type2='CAT')), - ('target_encoder', TargetEncoder(operation='upd', selected_type='CAT')), - # ('count_encoder', CatCount(operation='upd', selected_type='CAT')), - # ('one_hot_encoder', OneHot(operation='upd', selected_type='CAT')), - # ('label_encoder', LabelEncode(operation='upd', selected_type='CAT')), - ('scaler', TabScaler(operation='upd', selected_type='NUM')), - # ('boxcox', BoxCox(operation='upd', selected_type='NUM')), - # ('log_transform', LogTransform(operation='upd', selected_type='NUM')), - # ('binning', Binning(operation='upd', selected_type='NUM')), - # ('pca', TabPCA(operation='add', selected_type='NUM')), - # ('time_diff', TimeDiff(operation='add', selected_type='TIME')), - # ('time_offset', TimeOffset(operation='upd', selected_type='TIME')), - ('filter', FilterConstant(operation='del', selected_type='ALL')), - # ('pearson_corr', FeatureFilter(operation='del', selected_type='ALL')), - # ('lgbm_feat_selection', FeatureImportance(operation='del', selected_type='ALL')), - ]) + steps = [] + steps.extend(self.get_imputation_pipeline(self.params)) + steps.extend(self.get_higher_order_pipeline(self.params)) + steps.extend(self.get_categorical_pipeline(self.params)) + steps.extend(self.get_numerical_pipeline(self.params)) + steps.extend(self.get_time_pipeline(self.params)) + steps.extend(self.get_filtering_pipeline(self.params)) + self.pipeline = Pipeline(steps) + self.pipeline.fit(data, y) return self @@ -79,6 +72,94 @@ def transform(self, raw_x): a = self.pipeline.transform(data).X return a.values + @staticmethod + def get_categorical_pipeline(params): + choice = params.get('cat_encoding', 'target') + cat_pipeline = [] + if choice == 'target': + cat_pipeline.append(('target_encoder', TargetEncoder(operation='upd', selected_type='CAT'))) + elif choice == 'label': + cat_pipeline.append(('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) + elif choice == 'count': + cat_pipeline.append(('count_encoder', CatCount(operation='upd', selected_type='CAT'))) + elif choice == 'target+count': + cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', selected_type='CAT'))) + cat_pipeline.append(('count_encoder', CatCount(operation='upd', selected_type='CAT'))) + elif choice == 'one_hot': + cat_pipeline.append(('one_hot_encoder', OneHot(operation='upd', selected_type='CAT'))) + elif choice == 'target+label': + cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', selected_type='CAT'))) + cat_pipeline.append(('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) + else: + raise ValueError + return cat_pipeline + + @staticmethod + def get_numerical_pipeline(params): + scaling = params.get('scaling', True) + log_transform = params.get('log_transform', False) + power_transform = params.get('power_transform', False) + pca = params.get('pca', False) + binning = params.get('binning', False) + + numeric_pipeline = [] + if scaling: + numeric_pipeline.append(('scaler', TabScaler(operation='upd', selected_type='NUM'))) + if log_transform: + 
numeric_pipeline.append(('log_transform', LogTransform(operation='upd', selected_type='NUM'))) + if power_transform: + numeric_pipeline.append(('boxcox', BoxCox(operation='upd', selected_type='NUM'))) + if pca: + numeric_pipeline.append(('pca', TabPCA(operation='add', selected_type='NUM'))) + if binning: + numeric_pipeline.append(('binning', Binning(operation='add', selected_type='NUM'))) + return numeric_pipeline + + def get_filtering_pipeline(self, params): + pearson_thresh = params.get('pearson_thresh', 0) + feat_importance_thresh = params.get('feat_importance_thresh', 0) + + filter_pipeline = [('filter', FilterConstant(operation='del', selected_type='ALL'))] + if pearson_thresh > 0: + filter_pipeline.append(('pearson_corr', FeatureFilter(operation='del', selected_type='ALL', + threshold=pearson_thresh))) + if feat_importance_thresh > 0: + filter_pipeline.append(('lgbm_feat_selection', FeatureImportance(operation='del', + selected_type='ALL', + threshold=feat_importance_thresh, + task_type=self.config.objective))) + return filter_pipeline + + @staticmethod + def get_time_pipeline(params): + add_offset = params.get('add_time_offset', False) + add_diff = params.get('add_time_diff', False) + time_pipeline = [] + if add_offset: + time_pipeline.append(('time_offset', TimeOffset(operation='upd', selected_type='TIME'))) + if add_diff: + time_pipeline.append(('time_diff', TimeDiff(operation='add', selected_type='TIME'))) + return time_pipeline + + @staticmethod + def get_imputation_pipeline(params): + strategy = params.get('imputation_strategy', 'most_frequent') + impute_pipeline = [('imputer', Imputation(operation='upd', selected_type='ALL', strategy=strategy))] + return impute_pipeline + + @staticmethod + def get_higher_order_pipeline(params): + cat_num_strategy = params.get('cat_num_strategy', None) + cat_cat_strategy = params.get('cat_cat_strategy', None) + pipeline = [] + if cat_num_strategy: + pipeline.append(('cat_num_encoder', CatNumEncoder(operation='add', selected_type1='CAT', + selected_type2='NUM', strategy=cat_num_strategy))) + if cat_cat_strategy: + pipeline.append(('cat_cat_encoder', CatCatEncoder(operation='add', selected_type1='CAT', + selected_type2='CAT', strategy=cat_cat_strategy))) + return pipeline + class TabularData: cat_col = None @@ -385,6 +466,10 @@ def _transform(self, data, y=None): class CatCatEncoder(PrimitiveHigherOrder): supported_ops = ('add', ) cat_cat_map = None + strategy = None + + def init_vars(self, strategy='count'): + self.strategy = strategy @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): @@ -398,7 +483,6 @@ def cat_cat_count(df, col1, col2, strategy='count'): def _fit(self, data, y=None): self.cat_cat_map = {} - self.strategy = self.options.get('strategy', 'count') self.selected_1 = list(set(self.selected_1 + self.selected_2)) for col1, col2 in itertools.combinations(self.selected_1, 2): self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(data.X, col1, col2, self.strategy) @@ -416,6 +500,10 @@ def _transform(self, data, y=None): class CatNumEncoder(PrimitiveHigherOrder): supported_ops = ('add', ) cat_num_map = None + strategy = None + + def init_vars(self, strategy='mean'): + self.strategy = strategy @staticmethod def cat_num_interaction(df, col1, col2, method='mean'): @@ -434,7 +522,6 @@ def cat_num_interaction(df, col1, col2, method='mean'): def _fit(self, data, y=None): self.cat_num_map = {} - self.strategy = self.options.get('strategy', 'mean') for col1 in self.selected_1: for col2 in self.selected_2: 
self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(data.X, col1, col2, self.strategy) @@ -453,6 +540,10 @@ def _transform(self, data, y=None): class CatBinEncoder(PrimitiveHigherOrder): supported_ops = ('add', ) cat_bin_map = None + strategy = None + + def init_vars(self, strategy='percent_true'): + self.strategy = strategy @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): @@ -466,7 +557,6 @@ def cat_bin_interaction(df, col1, col2, strategy='percent_true'): def _fit(self, data, y=None): self.cat_bin_map = {} - self.strategy = self.options.get('strategy', 'percent_true') for col1 in self.selected_1: for col2 in self.selected_2: self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(data.X, col1, col2, self.strategy) @@ -578,12 +668,21 @@ def _transform(self, data, y=None): class Imputation(Primitive): impute_dict = None supported_ops = ('add', 'upd') + strategy = None + + def init_vars(self, strategy='most_frequent'): + self.strategy = strategy def _fit(self, data, y=None): self.impute_dict = {} for col in self.selected: - value_counts = data.X[col].value_counts() - self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 + if self.strategy == 'most_frequent': + value_counts = data.X[col].value_counts() + self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 + elif self.strategy == 'zero': + self.impute_dict[col] = 0 + else: + raise ValueError return self def _transform(self, data, y=None): @@ -599,6 +698,8 @@ class FeatureFilter(Primitive): supported_ops = ('del',) def init_vars(self, threshold=0.001): + if threshold == 0: + self.selected = None self.threshold = threshold self.drop_columns = [] @@ -622,6 +723,8 @@ class FeatureImportance(Primitive): supported_ops = ('del',) def init_vars(self, threshold=0.001, task_type='classification'): + if threshold == 0: + self.selected = None self.threshold = threshold self.drop_columns = [] self.task_type = task_type From d5b7f9ca0345eb161c217247355dc00fa06bfde8 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Fri, 20 Sep 2019 06:52:38 -0500 Subject: [PATCH 27/31] 2 rounds search --- autokaggle/auto_ml.py | 31 ++++++++++++++---- autokaggle/config.py | 71 ++++++++++++++++++++++++++++++++++++++-- autokaggle/ensemblers.py | 4 +-- 3 files changed, 94 insertions(+), 12 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index c7c021a..6c61da3 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -10,7 +10,8 @@ from autokaggle.preprocessor import TabularPreprocessor from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from lightgbm import LGBMClassifier, LGBMRegressor -from autokaggle.config import Config, classification_hspace, regression_hspace +from autokaggle.config import Config, classification_hspace, regression_hspace, classification_hspace_base,\ + regression_hspace_base, regression_p_hspace_base, classification_p_hspace_base from sklearn.model_selection import StratifiedKFold, KFold import hyperopt from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL @@ -23,7 +24,9 @@ # TODO: Further clean the design of this file class AutoKaggle(BaseEstimator): pipeline = None - hparams = None + m_hparams = None + m_hparams_base = None + p_hparams_base = None def __init__(self, config=None, **kwargs): """ @@ -72,8 +75,13 @@ def fit(self, x, y, time_limit=None, data_info=None): self.config.objective = 'binary' if n_classes == 2 else 'multiclass' # self.pipeline = 
AutoPipe(LGBMClassifier, {}, {}, self.config) - prep_space = {'prep': hp.choice('data_source', ['a', 'b'])} - self.pipeline = self.get_best_pipeline(self.search(x, y, prep_space, self.hparams)) + # Search the top preprocessing setting + trials = self.search(x, y, self.p_hparams_base, self.m_hparams_base) + p_hparams = self.get_top_prep(trials) + # Search the best pipelines + trials = self.search(x, y, p_hparams, self.m_hparams_base) + self.pipeline = self.get_best_pipeline(trials) + # Fit data self.pipeline.fit(x, y) self.is_trained = True @@ -153,7 +161,7 @@ def objective_func(params): trials = Trials() search_space = {'prep': prep_space, 'estimator': model_space} - _ = fmin(objective_func, search_space, algo=hyperopt.rand.suggest, trials=trials, + _ = fmin(objective_func, search_space, algo=self.config.ensembling_algo, trials=trials, max_evals=self.config.search_iter) return trials @@ -168,6 +176,9 @@ def get_best_pipeline(self, trials): print(opt) return best_pipeline + def get_top_prep(self, trials): + return hp.choice('p_params', [res['p_params'] for res in trials.results]) + @abstractmethod def get_skf(self, folds): pass @@ -236,7 +247,10 @@ class AutoKaggleClassifier(AutoKaggle): def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) self.config.objective = 'classification' - self.hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) + self.m_hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) + self.m_hparams_base = hp.choice('classifier', + [classification_hspace_base[m] for m in self.config.classification_models]) + self.p_hparams_base = classification_p_hspace_base def get_skf(self, folds): if self.config.objective == 'binary': @@ -252,7 +266,10 @@ class AutoKaggleRegressor(AutoKaggle): def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) self.config.objective = 'regression' - self.hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) + self.m_hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) + self.m_hparams_base = hp.choice('regressor', + [regression_hspace_base[m] for m in self.config.classification_models]) + self.p_hparams_base = regression_p_hspace_base def get_skf(self, folds): return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) diff --git a/autokaggle/config.py b/autokaggle/config.py index 4e780ff..8855251 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -1,5 +1,6 @@ from sklearn.base import BaseEstimator from autokaggle.utils import rand_temp_folder_generator, ensure_dir +import hyperopt from hyperopt import hp from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier @@ -12,10 +13,10 @@ class Config: - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=50, + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, - data_info=None, balance_class_dist=False): + data_info=None, balance_class_dist=False, ensembling_search_iter=10, ensembling_algo='random'): self.verbose 
= verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -41,6 +42,8 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.stack_probabilities = stack_probabilities self.data_info = data_info self.balance_class_dist = balance_class_dist + self.ensembling_search_iter = ensembling_search_iter + self.ensembling_algo = hyperopt.rand.suggest if ensembling_algo == 'random' else hyperopt.tpe.suggest def update(self, options): for k, v in options.items(): @@ -205,4 +208,66 @@ def update(self, options): 'model': CatBoostClassifier, 'param': catboost_classifier_params } +} + +classification_hspace_base = { + 'knn': { + 'model': KNeighborsClassifier, + 'param': {} + }, + 'svm': { + 'model': SVC, + 'param': {} + }, + 'random_forest': { + 'model': RandomForestClassifier, + 'param': {} + }, + 'lgbm': { + 'model': LGBMClassifier, + 'param': {} + }, + 'adaboost': { + 'model': AdaBoostClassifier, + 'param': {} + }, + 'catboost': { + 'model': CatBoostClassifier, + 'param': {} + } +} + +regression_hspace_base = { + 'extratree': { + 'model': ExtraTreesRegressor, + 'param': {} + }, + 'ridge': { + 'model': Ridge, + 'param': {} + }, + 'random_forest': { + 'model': RandomForestRegressor, + 'param': {} + }, + 'lgbm': { + 'model': LGBMRegressor, + 'param': {} + }, + 'adaboost': { + 'model': AdaBoostRegressor, + 'param': {} + }, + 'catboost': { + 'model': CatBoostRegressor, + 'param': {} + } +} + +regression_p_hspace_base = { + +} + +classification_p_hspace_base = { + } \ No newline at end of file diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 68cafa7..30e411f 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -112,8 +112,8 @@ def objective_func(args): return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, - max_evals=50) + best = fmin(objective_func, self.hparams, algo=self.config.ensembling_algo, trials=trials, + max_evals=self.config.ensembling_search_iter) opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) From 6d25537d4f0cc50f2013017d4f4f933730ac0eff Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Fri, 20 Sep 2019 09:12:49 -0500 Subject: [PATCH 28/31] Select best preprocessing settings --- autokaggle/auto_ml.py | 17 ++++++++++++++--- autokaggle/config.py | 8 +++++--- examples/benchmarking.py | 3 +-- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 6c61da3..978682b 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -77,7 +77,7 @@ def fit(self, x, y, time_limit=None, data_info=None): # self.pipeline = AutoPipe(LGBMClassifier, {}, {}, self.config) # Search the top preprocessing setting trials = self.search(x, y, self.p_hparams_base, self.m_hparams_base) - p_hparams = self.get_top_prep(trials) + p_hparams = self.get_top_prep(trials, self.config.num_p_hparams) # Search the best pipelines trials = self.search(x, y, p_hparams, self.m_hparams_base) self.pipeline = self.get_best_pipeline(trials) @@ -176,8 +176,19 @@ def get_best_pipeline(self, trials): print(opt) return best_pipeline - def get_top_prep(self, trials): - return hp.choice('p_params', [res['p_params'] for res in trials.results]) + @staticmethod + def get_top_prep(trials, n): + best_trials = [t for t in trials.results if t['loss'] != float('inf')] + best_trials = sorted(best_trials, key=lambda k: 
k['loss'], reverse=False) + top_p_hparams, count = [], 0 + for trial in best_trials: + if trial['p_params'] not in top_p_hparams: + top_p_hparams.append(trial) + count += 1 + if count > n: + break + + return hp.choice('p_params', top_p_hparams) @abstractmethod def get_skf(self, folds): diff --git a/autokaggle/config.py b/autokaggle/config.py index 8855251..610e84b 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -13,10 +13,11 @@ class Config: - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, - data_info=None, balance_class_dist=False, ensembling_search_iter=10, ensembling_algo='random'): + data_info=None, balance_class_dist=False, ensembling_search_iter=10, ensembling_algo='random', + num_p_hparams=10): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -44,6 +45,7 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=Fals self.balance_class_dist = balance_class_dist self.ensembling_search_iter = ensembling_search_iter self.ensembling_algo = hyperopt.rand.suggest if ensembling_algo == 'random' else hyperopt.tpe.suggest + self.num_p_hparams = num_p_hparams def update(self, options): for k, v in options.items(): diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 50dab23..604d92e 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -280,14 +280,13 @@ def get_box_plot(data, task_id, metric): if __name__ == "__main__": regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] - # classification_task_list = [3021] classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, 9914, 3020, 3524, 3573, 3962] ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time # t1 = time.time() - # for _ in range(3): + # for _ in range(1): # ak.run_automation(classification_task_list) # t2 = time.time() # print(t2-t1) From 8e0068b868a1af117ff4f29d049a902ade23fd33 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Fri, 20 Sep 2019 09:31:03 -0500 Subject: [PATCH 29/31] prep param space update --- autokaggle/config.py | 30 +++++++++++++++++++++++++++--- autokaggle/preprocessor.py | 3 +-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/autokaggle/config.py b/autokaggle/config.py index 610e84b..89be536 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -267,9 +267,33 @@ def update(self, options): } regression_p_hspace_base = { - + 'cat_encoding': hp.choice('cat_enc', ['count', 'target+count', 'target+label', 'label']), + 'scaling': hp.choice('scaling', [True, False]), + 'log_transform': hp.choice('log_transform', [True, False]), + 'power_transform': hp.choice('power_transform', [True, False]), + 'pca': hp.choice('pca', [True, False]), + 'binning': hp.choice('binning', [True, False]), + 'add_time_offset': hp.choice('add_time_offset', [True, False]), + 'add_time_diff': hp.choice('add_time_diff', [True, False]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', 'min', None]), + # 
'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), + 'imputation_strategy': hp.choice('imputation_strategy', ['most_frequent', 'zero']), + 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), + 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) } classification_p_hspace_base = { - -} \ No newline at end of file + 'cat_encoding': hp.choice('cat_enc', ['target', 'count', 'target+count', 'target+label']), + 'scaling': hp.choice('scaling', [True, False]), + 'log_transform': hp.choice('log_transform', [True, False]), + 'power_transform': hp.choice('power_transform', [True, False]), + 'pca': hp.choice('pca', [True, False]), + 'binning': hp.choice('binning', [True, False]), + 'add_time_offset': hp.choice('add_time_offset', [True, False]), + 'add_time_diff': hp.choice('add_time_diff', [True, False]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', 'min', None]), + # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), + 'imputation_strategy': hp.choice('imputation_strategy', ['most_frequent', 'zero']), + 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), + 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) +} diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 258374a..4b789d6 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -382,8 +382,7 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - x_tr = self.binner.transform(data.X[self.selected]) - # TODO: decide if cat or num new type + x_tr = pd.DataFrame(self.binner.transform(data.X[self.selected])) data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data From c29f15947c7bbbb5974c27ee81a9f74794103b74 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 24 Sep 2019 19:41:29 -0500 Subject: [PATCH 30/31] Address review comments --- autokaggle/__init__.py | 2 +- autokaggle/auto_ml.py | 343 ++++++++++--- autokaggle/config.py | 91 ++-- autokaggle/ensemblers.py | 182 +++---- autokaggle/preprocessor.py | 468 ++++++++++++++++-- examples/benchmarking.py | 167 ++++++- examples/tabular_classification_binary.py | 2 +- examples/tabular_classification_multiclass.py | 2 +- examples/tabular_regression.py | 2 +- 9 files changed, 981 insertions(+), 278 deletions(-) diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index 727b31e..6744a7a 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,2 +1,2 @@ -from autokaggle.auto_ml import AutoKaggleClassifier, AutoKaggleRegressor +from autokaggle.auto_ml import Classifier, Regressor from autokaggle.ensemblers import * diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 978682b..4c2dc47 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -7,48 +7,79 @@ from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error from joblib import dump, load -from autokaggle.preprocessor import TabularPreprocessor +from autokaggle.preprocessor import Preprocessor from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from lightgbm import LGBMClassifier, LGBMRegressor -from autokaggle.config import Config, classification_hspace, regression_hspace, classification_hspace_base,\ - regression_hspace_base, regression_p_hspace_base, classification_p_hspace_base +from autokaggle.config import Config, CLASSIFICATION_PREP_HPARAM_SPACE, REGRESSION_PREP_HPARAM_SPACE, \ + 
REGRESSION_BASE_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, REGRESSION_HPARAM_SPACE from sklearn.model_selection import StratifiedKFold, KFold import hyperopt from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL from sklearn.model_selection import cross_val_score -from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler +from autokaggle.ensemblers import RankedEnsemblingModel, StackedEnsemblingModel from imblearn.over_sampling import SMOTE, SMOTENC import collections -# TODO: Further clean the design of this file class AutoKaggle(BaseEstimator): - pipeline = None - m_hparams = None - m_hparams_base = None - p_hparams_base = None - - def __init__(self, config=None, **kwargs): - """ - Initialization function for tabular supervised learner. - """ + """ Automated Machine Learning system class. + + AutoKaggle implements an end to end automated ML system. It initiates and searches for the optimum ML pipeline. + The user can use it with the simple `fit()` and `predict()` methods like Sci-kit learn estimators. + The user can specify various parameters controlling different components of the system. + # Arguments + path: String. OS path for storing temporary model parameters. + verbose: Bool. Defines the verbosity of the logging. + time_limit: Int. Time budget for performing search and fit pipeline. + use_ensembling: Bool. Defines whether to use an ensemble of models + num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + ensemble_strategy: String. Strategy to ensemble models + ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked randomly. + diverse_ensemble: Bool. Whether estimators from different families are picked. + ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + search_algo: String. Search strategy for hyper-parameter search. + search_iter: Int. Number of iterations used for hyper-parameter search. + cv_folds: Int. Number of Cross Validation folds. + subsample_ratio: Percent of subsample used for for hyper-parameter search. + data_info: list(String). Lists the datatypes of each feature column. + stack_probabilities: Bool. Whether to use class probabilities in ensembling. + upsample_classes: Bool. Whether to upsample less represented classes + num_p_hparams: Int. Number of preprocessor search spaces. 
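[Editor's note, not part of the patch] A hypothetical end-to-end call of the interface documented above, using a synthetic dataset and argument names taken from the docstring; small budgets keep it quick, and it assumes Classifier accepts the same keyword arguments as the base class.

    import numpy as np
    from autokaggle import Classifier

    x = np.random.rand(100, 4)
    y = np.random.randint(0, 2, 100)
    clf = Classifier(verbose=False, use_ensembling=False, search_iter=3, cv_folds=3)
    clf.fit(x, y, time_limit=60, data_info=np.array(['NUM'] * 4))
    print(clf.evaluate(x, y))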
+ """ + + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, + num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + search_algo='random', num_p_hparams=10): self.is_trained = False - self.config = config if config else Config() - self.config.update(kwargs) - if not self.config.path: - self.config.path = rand_temp_folder_generator() + if not path: + path = rand_temp_folder_generator() + self.config = Config(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, + subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, + diverse_ensemble=diverse_ensemble, stack_probabilities=stack_probabilities, + data_info=data_info, upsample_classes=upsample_classes, + ensembling_search_iter=ensembling_search_iter, search_algo=search_algo, + num_p_hparams=num_p_hparams) + self.pipeline = None + self.m_hparams = None + self.m_hparams_base = None + self.p_hparams_base = None def fit(self, x, y, time_limit=None, data_info=None): - """ - This function should train the model parameters. - Args: + """ Train an autoML system. + # Arguments x: A numpy.ndarray instance containing the training data. y: training label vector. time_limit: remaining time budget. - data_info: meta-features of the dataset, which is an numpy.ndarray describing the - feature type of each column in raw_x. The feature type include: - 'TIME' for temporal feature, 'NUM' for other numerical feature, - and 'CAT' for categorical feature. + data_info: meta-features of the dataset, which is an numpy.ndarray describing the feature type of each + column in raw_x. The feature type include: 'TIME' for temporal feature, 'NUM' for other numerical feature, + and 'CAT' for categorical feature. + # Returns + None Both inputs X and y are numpy arrays. If fit is called multiple times on incremental data (train, test1, test2, etc.) you should warm-start your training from the pre-trained model. Past data will @@ -64,7 +95,7 @@ def fit(self, x, y, time_limit=None, data_info=None): print('#TIME features: {}'.format(sum(self.config.data_info == 'TIME'))) print('#NUM features: {}'.format(sum(self.config.data_info == 'NUM'))) print('#CAT features: {}'.format(sum(self.config.data_info == 'CAT'))) - + if x.shape[1] == 0: raise ValueError("No feature exist!") @@ -86,9 +117,12 @@ def fit(self, x, y, time_limit=None, data_info=None): self.is_trained = True def predict(self, x_test): - """ - This function should provide predictions of labels on (test) data. - The function predict eventually can return probabilities or continuous values. + """ Generate prediction on the test data for the given task. + # Arguments + x_test: A numpy.ndarray instance containing the test data. + # Returns + A numpy array for the predictions on the x_test. + This function provides predictions of labels on (test) data. """ y = self.pipeline.predict(x_test) if y is None: @@ -96,12 +130,26 @@ def predict(self, x_test): return y def predict_proba(self, x_test): + """ Predict label probabilities on the test data for the given classification task. + # Arguments + x_test: A numpy.ndarray instance containing the test data. 
+ # Returns + A numpy array for the prediction probabilities on the x_test. + The function returns predicted probabilities for every class label. + """ y = self.pipeline.predict_proba(x_test) if y is None: raise ValueError("Tabular predictor does not exist") return y def evaluate(self, x_test, y_test): + """ Predict label probabilities on the test data for the given classification task. + # Arguments + x_test: A numpy.ndarray instance containing the training data. + y_test: A numpy array with ground truth labels for the test data + # Returns + An evaluation score based on the task type. + """ if self.config.verbose: print('objective:', self.config.objective) y_pred = self.predict(x_test) @@ -114,11 +162,15 @@ def evaluate(self, x_test, y_test): results = mean_squared_error(y_test, y_pred) return results - def final_fit(self, x_train, y_train): - self.pipeline.fit(x_train, y_train) - def resample(self, x, y): - if self.config.balance_class_dist: + """ Up-samples the input data + # Arguments + x: A numpy array for features + y: A numpy array for target + # Returns + Up-sampled version of the dataset + """ + if self.config.upsample_classes: x, y = SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(x, y) while x.shape[0] < 60: x = np.concatenate([x, x], axis=0) @@ -126,6 +178,14 @@ def resample(self, x, y): return x, y def subsample(self, x, y, sample_percent): + """ Takes a sub-sample of the input data, for the hyper-parameter search. + # Arguments + x: A numpy array for features + y: A numpy array for target + sample_percent: Minimum percentage of the data to be maintained + # Returns + Down-sampled dataset + """ # TODO: Add way to balance the subsample # Set small sample for hyper-param search if x.shape[0] > 600: @@ -138,6 +198,15 @@ def subsample(self, x, y, sample_percent): return grid_train_x, grid_train_y def search(self, x, y, prep_space, model_space): + """ Do hyper-parameter search to find optimal machine learning pipeline. + # Arguments + x: A numpy array for features + y: A numpy array for target + prep_space: Hyper-parameter search space for preprocessors + model_space: Hyper-parameter search space for estimators + # Returns + List of hyper-parameter trials + """ grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) score_metric, skf = self.get_skf(self.config.cv_folds) @@ -149,7 +218,8 @@ def objective_func(params): try: eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() status = STATUS_OK - except ValueError: + except ValueError as e: + print(e) eval_score = float('-inf') status = STATUS_FAIL if self.config.verbose: @@ -161,11 +231,17 @@ def objective_func(params): trials = Trials() search_space = {'prep': prep_space, 'estimator': model_space} - _ = fmin(objective_func, search_space, algo=self.config.ensembling_algo, trials=trials, - max_evals=self.config.search_iter) + _ = fmin(objective_func, search_space, algo=self.config.search_algo, trials=trials, + max_evals=self.config.search_iter, rstate=np.random.RandomState(self.config.random_state)) return trials def get_best_pipeline(self, trials): + """ Finds the optimal pipeline from the given list of search trials. 
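+ When `use_ensembling` is enabled this delegates to `setup_ensemble()`; otherwise the single best trial from the search is used.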
+ # Arguments + trials: List of hyper-parameter search trials + # Returns + Optimal pipeline based on the given list of trials + """ if self.config.use_ensembling: best_pipeline = self.setup_ensemble(trials) else: @@ -178,6 +254,13 @@ def get_best_pipeline(self, trials): @staticmethod def get_top_prep(trials, n): + """ Find the list of top N preprocessor settings. + # Arguments + trials: List of hyper-parameter search trials + n: Maximum number of preprocessor settings required + # Returns + List of the top N optimal preprocessor settings. + """ best_trials = [t for t in trials.results if t['loss'] != float('inf')] best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) top_p_hparams, count = [], 0 @@ -192,16 +275,28 @@ def get_top_prep(trials, n): @abstractmethod def get_skf(self, folds): + """ Get the scoring metric and the cross validation folds for evaluation. + # Arguments + folds: NUmber of cross validation folds + # Returns + Scoring metric and cross validation folds. + """ pass - def pick_diverse_estimators(self, trial_list, k): + def pick_diverse_estimators(self, trial_list): + """ Selects the best hyper-parameter settings from each estimator family. + # Arguments + trial_list: List of the hyper-parameter search trials. + # Returns + List of top hyper-parameter spaces equally selected from each estimator family. + """ groups = collections.defaultdict(list) for obj in trial_list: groups[obj['model_class']].append(obj) estimator_list = [] idx, j = 0, 0 - while idx < k: + while idx < self.config.num_estimators_ensemble: for grp in groups.values(): if j < len(grp): est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'], grp[j]['p_params'], self.config) @@ -211,6 +306,12 @@ def pick_diverse_estimators(self, trial_list, k): return estimator_list def setup_ensemble(self, trials): + """ Generates the optimal ensembling estimator based on the given setting. + # Arguments + trials: List of the hyper-parameter search trials. + # Returns + An ensembling estimator to be trained using the base estimators picked from trials. + """ # Filter the unsuccessful hparam spaces i.e. 'loss' == float('inf') best_trials = [t for t in trials.results if t['loss'] != float('inf')] best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) @@ -221,7 +322,7 @@ def setup_ensemble(self, trials): np.random.shuffle(best_trials) if self.config.diverse_ensemble: - estimator_list = self.pick_diverse_estimators(best_trials, self.config.num_estimators_ensemble) + estimator_list = self.pick_diverse_estimators(best_trials) else: estimator_list = [] for i in range(self.config.num_estimators_ensemble): @@ -230,18 +331,19 @@ def setup_ensemble(self, trials): estimator_list.append(est) if self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, config=self.config) + best_estimator_ = StackedEnsemblingModel(estimator_list, config=self.config) else: - best_estimator_ = RankedEnsembler(estimator_list, config=self.config) + best_estimator_ = RankedEnsemblingModel(estimator_list, config=self.config) return best_estimator_ @staticmethod def extract_data_info(raw_x): """ - This function extracts the data info automatically based on the type of each feature in raw_x. - - Args: + Extracts the data info automatically based on the type of each feature in raw_x. + # Arguments raw_x: a numpy.ndarray instance containing the training data. + # Returns + A list of data-types for each feature in the data. 
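+ # Example
+ An illustrative call (hypothetical output; the detected types depend on the actual column contents):
+ data_info = Classifier.extract_data_info(x_train)  # e.g. array(['NUM', 'CAT', 'TIME'])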
""" data_info = [] row_num, col_num = raw_x.shape @@ -254,16 +356,54 @@ def extract_data_info(raw_x): return np.array(data_info) -class AutoKaggleClassifier(AutoKaggle): - def __init__(self, config=None, **kwargs): - super().__init__(config, **kwargs) +class Classifier(AutoKaggle): + """ Extends AutoKaggle for Classification. + + Extends the AutoKaggle specific to the classification requirements. + # Arguments + path: String. OS path for storing temporary model parameters. + verbose: Bool. Defines the verbosity of the logging. + time_limit: Int. Time budget for performing search and fit pipeline. + use_ensembling: Bool. Defines whether to use an ensemble of models + num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + ensemble_strategy: String. Strategy to ensemble models + ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked randomly. + diverse_ensemble: Bool. Whether estimators from different families are picked. + ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + search_algo: String. Search strategy for hyper-parameter search. + search_iter: Int. Number of iterations used for hyper-parameter search. + cv_folds: Int. Number of Cross Validation folds. + subsample_ratio: Percent of subsample used for for hyper-parameter search. + data_info: list(String). Lists the datatypes of each feature column. + stack_probabilities: Bool. Whether to use class probabilities in ensembling. + upsample_classes: Bool. Whether to upsample less represented classes + num_p_hparams: Int. Number of preprocessor search spaces. + """ + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, + num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + search_algo='random', num_p_hparams=10): + super().__init__(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, + subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, diverse_ensemble=diverse_ensemble, + stack_probabilities=stack_probabilities, data_info=data_info, + upsample_classes=upsample_classes, ensembling_search_iter=ensembling_search_iter, + search_algo=search_algo, num_p_hparams=num_p_hparams) self.config.objective = 'classification' - self.m_hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) + self.m_hparams = hp.choice('classifier', [CLASSIFICATION_HPARAM_SPACE[m] for m in + self.config.classification_models]) self.m_hparams_base = hp.choice('classifier', - [classification_hspace_base[m] for m in self.config.classification_models]) - self.p_hparams_base = classification_p_hspace_base + [CLASSIFICATION_BASE_HPARAM_SPACE[m] for m in + self.config.classification_models]) + self.p_hparams_base = CLASSIFICATION_PREP_HPARAM_SPACE def get_skf(self, folds): + """ + See the base class. 
+ """ if self.config.objective == 'binary': score_metric = 'roc_auc' skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) @@ -273,28 +413,75 @@ def get_skf(self, folds): return score_metric, skf -class AutoKaggleRegressor(AutoKaggle): - def __init__(self, config=None, **kwargs): - super().__init__(config, **kwargs) +class Regressor(AutoKaggle): + """ Extends AutoKaggle for Regression. + + Extends the AutoKaggle specific to the regression requirements. + # Arguments + path: String. OS path for storing temporary model parameters. + verbose: Bool. Defines the verbosity of the logging. + time_limit: Int. Time budget for performing search and fit pipeline. + use_ensembling: Bool. Defines whether to use an ensemble of models + num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + ensemble_strategy: String. Strategy to ensemble models + ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked randomly. + diverse_ensemble: Bool. Whether estimators from different families are picked. + ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + search_algo: String. Search strategy for hyper-parameter search. + search_iter: Int. Number of iterations used for hyper-parameter search. + cv_folds: Int. Number of Cross Validation folds. + subsample_ratio: Percent of subsample used for hyper-parameter search. + data_info: list(String). Lists the datatypes of each feature column. + stack_probabilities: Bool. Whether to use class probabilities in ensembling. + upsample_classes: Bool. Whether to upsample less represented classes + num_p_hparams: Int. Number of preprocessor search spaces. + """ + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, + num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + search_algo='random', num_p_hparams=10): + super().__init__(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, + subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, + diverse_ensemble=diverse_ensemble, + stack_probabilities=stack_probabilities, data_info=data_info, + upsample_classes=upsample_classes, ensembling_search_iter=ensembling_search_iter, + search_algo=search_algo, num_p_hparams=num_p_hparams) self.config.objective = 'regression' - self.m_hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) + self.m_hparams = hp.choice('regressor', [REGRESSION_HPARAM_SPACE[m] for m in self.config.regression_models]) self.m_hparams_base = hp.choice('regressor', - [regression_hspace_base[m] for m in self.config.classification_models]) + [REGRESSION_BASE_HPARAM_SPACE[m] for m in self.config.regression_models]) - self.p_hparams_base = regression_p_hspace_base + self.p_hparams_base = REGRESSION_PREP_HPARAM_SPACE def get_skf(self, folds): + """ + See the base class.
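+ For regression this returns the 'neg_mean_squared_error' scorer together with a shuffled KFold splitter.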
+ """ return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) class AutoPipe(BaseEstimator): - prep = None - model = None - config = None - m_params = None - p_params = None - model_class = None - + """ Implements a machine learning pipeline. + + Implements a machine learning pipeline with preprocessor and estimator. A user can call fit(), and predict() + methods on it. It is used as a search unit in AutoKaggle's hyeper-parameter search. + # Arguments + config: Config. Defines the configuration of various components of the pipeline. + m_params: Dict. Hyper-parameter search space for estimator. + p_params: Dict. Hyper-parameter search space for preprocessor. + model_class: Estimator. Class name of the estimator used in the pipeline. + _estimator_type: String. Denotes if the estimator is 'classifier' or 'regressor' + prep: Preprocessor. Instance of the Preprocessor class, which does basic feature preprocessing and feature + engineering + model: Estimator. Instance of the estimator class which learns a machine learning model and predicts on the + given data. + """ def __init__(self, model_class, m_params, p_params, config): + self.prep = None + self.model = None self.config = config self.m_params = m_params self.p_params = p_params @@ -302,16 +489,37 @@ def __init__(self, model_class, m_params, p_params, config): self._estimator_type = 'classifier' if is_classifier(model_class) else 'regressor' def fit(self, x, y): - self.prep = TabularPreprocessor(self.config, self.p_params) + """ Trains the given pipeline. + # Arguments + x: A numpy.ndarray instance containing the training data. + y: training label vector. + # Returns + None + """ + self.prep = Preprocessor(self.config, self.p_params) self.model = self.model_class(**self.m_params) x = self.prep.fit_transform(x, y) self.model.fit(x, y) def predict(self, x): + """ Generate prediction on the test data for the given task. + # Arguments + x: A numpy.ndarray instance containing the test data. + # Returns + A numpy array for the predictions on the x. + This function provides predictions of labels on (test) data. + """ x = self.prep.transform(x) return self.model.predict(x) def predict_proba(self, x): + """ Predict label probabilities on the test data for the given classification task. + # Arguments + x: A numpy.ndarray instance containing the test data. + # Returns + A numpy array for the prediction probabilities on the x. + The function returns predicted probabilities for every class label. + """ x = self.prep.transform(x) try: return self.model.predict_proba(x) @@ -319,6 +527,13 @@ def predict_proba(self, x): return self.model.predict(x) def decision_function(self, x): + """ Returns the decision function learned by the estimator. + # Arguments + x: A numpy.ndarray instance containing the test data. + # Returns + Decision function learned by the estimator. + This is used by the scorers to evaluate the pipeline. + """ x = self.prep.transform(x) try: return self.model.decision_function(x) diff --git a/autokaggle/config.py b/autokaggle/config.py index 89be536..87a4f30 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -13,10 +13,35 @@ class Config: + """ Configuration for various autoML components. + + Defines the common configuration of different auto ML components. It is shared between AutoKaggle, AutoPipe, + Preprocessor and Ensembling class. + + # Arguments + path: String. OS path for storing temporary model parameters. + verbose: Bool. Defines the verbosity of the logging. 
+ time_limit: Int. Time budget for performing search and fit pipeline. + use_ensembling: Bool. Defines whether to use an ensemble of models + num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + ensemble_strategy: String. Strategy to ensemble models + ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked randomly. + diverse_ensemble: Bool. Whether estimators from different families are picked. + ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + search_algo: String. Search strategy for hyper-parameter search. + search_iter: Int. Number of iterations used for hyper-parameter search. + cv_folds: Int. Number of Cross Validation folds. + subsample_ratio: Percent of subsample used for for hyper-parameter search. + data_info: list(String). Lists the datatypes of each feature column. + stack_probabilities: Bool. Whether to use class probabilities in ensembling. + upsample_classes: Bool. Whether to upsample less represented classes + num_p_hparams: Int. Number of preprocessor search spaces. + """ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, - data_info=None, balance_class_dist=False, ensembling_search_iter=10, ensembling_algo='random', + data_info=None, upsample_classes=False, ensembling_search_iter=10, search_algo='random', num_p_hparams=10): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() @@ -42,9 +67,9 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.diverse_ensemble = diverse_ensemble self.stack_probabilities = stack_probabilities self.data_info = data_info - self.balance_class_dist = balance_class_dist + self.upsample_classes = upsample_classes self.ensembling_search_iter = ensembling_search_iter - self.ensembling_algo = hyperopt.rand.suggest if ensembling_algo == 'random' else hyperopt.tpe.suggest + self.search_algo = hyperopt.rand.suggest if search_algo == 'random' else hyperopt.tpe.suggest self.num_p_hparams = num_p_hparams def update(self, options): @@ -53,14 +78,14 @@ def update(self, options): setattr(self, k, v) -knn_classifier_params = { +KNN_CLASSIFIER_PARAMS = { 'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]), 'weights': hp.choice('weight_knn', ['uniform', 'distance']), 'metric': hp.choice('metric_knn', ["euclidean", "manhattan", "chebyshev", "minkowski"]), 'p': hp.choice('p_knn', range(1, 3)), } -svc_params = { +SVM_CLASSIFIER_PARAMS = { 'C': hp.loguniform('C_svm', np.log(0.03125), np.log(32768)), 'kernel': hp.choice('kernel_svm', ['rbf', 'poly', 'sigmoid']), 'degree': hp.choice('degree_svm', range(2, 6)), @@ -68,7 +93,7 @@ def update(self, options): 'max_iter': 50000, } -random_forest_classifier_params = { +RANDOM_FOREST_CLASSIFIER_PARAMS = { 'criterion': hp.choice('criterion_rf', ['entropy', 'gini']), 'max_features': hp.uniform('max_features_rf', 0, 1.0), 'n_estimators': hp.choice('n_estimators_rf', [100, 50]), @@ -76,7 +101,7 @@ def update(self, options): 'min_samples_split': hp.choice('min_samples_split_rf', range(2, 20)), } -lgbm_classifier_params = { +LGBM_CLASSIFIER_PARAMS = { 'boosting_type': 'gbdt', 'min_split_gain': 0.1, 'subsample': 0.8, @@ 
-88,13 +113,13 @@ def update(self, options): 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2), high=np.log(2)), } -adaboost_classifier_params = { +ADABOOST_CLASSIFIER_PARAMS = { 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)), 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), } -catboost_classifier_params = { +CATBOOST_CLASSIFIER_PARAMS = { 'iterations': hp.choice('iterations_catboost', [5, 10]), 'depth': hp.choice('depth_catboost', range(4, 11)), 'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), high=np.log(1)), @@ -104,7 +129,7 @@ def update(self, options): 'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3)) } -extra_trees_regressor_params = { +EXTRA_TREES_REGRESSOR_PARAMS = { 'n_estimators': hp.choice('n_estimators_extra_trees', [50, 100, 200]), 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), @@ -114,13 +139,13 @@ def update(self, options): 'bootstrap': hp.choice('bootstrap_extra_trees', [True, False]), } -ridge_params = { +RIDGE_REGRESSOR_PARAMS = { 'fit_intercept': True, 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), 'alpha': hp.loguniform('alpha_ridge', np.log(1e-5), np.log(10)) } -random_forest_regressor_params = { +RANDOM_FOREST_REGRESSOR_PARAMS = { 'criterion': hp.choice('criterion_rf', ['mse', 'friedman_mse', 'mae']), 'max_features': hp.uniform('max_features_rf', 0.1, 1.0), 'n_estimators': hp.choice('n_estimators_rf', [50, 100, 200]), @@ -129,7 +154,7 @@ def update(self, options): 'bootstrap': hp.choice('bootstrap_rf', [True, False]), } -lgbm_regressor_params = { +LGBM_REGRESSOR_PARAMS = { 'boosting_type': 'gbdt', 'min_split_gain': 0.1, 'subsample': 0.8, @@ -141,14 +166,14 @@ def update(self, options): 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), high=np.log(1)), } -adaboost_regressor_params = { +ADABOOST_REGRESSOR_PARAMS = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)), 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)), } -catboost_regressor_params = { +CATBOOST_REGRESSOR_PARAMS = { 'iterations': 2, 'depth': hp.choice('depth_catboost', range(4, 10)), 'learning_rate': 1, @@ -157,62 +182,62 @@ def update(self, options): } -regression_hspace = { +REGRESSION_HPARAM_SPACE = { 'extratree': { 'model': ExtraTreesRegressor, - 'param': extra_trees_regressor_params + 'param': EXTRA_TREES_REGRESSOR_PARAMS }, 'ridge': { 'model': Ridge, - 'param': ridge_params + 'param': RIDGE_REGRESSOR_PARAMS }, 'random_forest': { 'model': RandomForestRegressor, - 'param': random_forest_regressor_params + 'param': RANDOM_FOREST_REGRESSOR_PARAMS }, 'lgbm': { 'model': LGBMRegressor, - 'param': lgbm_regressor_params + 'param': LGBM_REGRESSOR_PARAMS }, 'adaboost': { 'model': AdaBoostRegressor, - 'param': adaboost_regressor_params + 'param': ADABOOST_REGRESSOR_PARAMS }, 'catboost': { 'model': CatBoostRegressor, - 'param': catboost_regressor_params + 'param': CATBOOST_REGRESSOR_PARAMS } } -classification_hspace = { +CLASSIFICATION_HPARAM_SPACE = { 'knn': { 'model': KNeighborsClassifier, - 'param': knn_classifier_params + 'param': KNN_CLASSIFIER_PARAMS }, 'svm': { 'model': SVC, - 'param': svc_params + 
'param': SVM_CLASSIFIER_PARAMS }, 'random_forest': { 'model': RandomForestClassifier, - 'param': random_forest_classifier_params + 'param': RANDOM_FOREST_CLASSIFIER_PARAMS }, 'lgbm': { 'model': LGBMClassifier, - 'param': lgbm_classifier_params + 'param': LGBM_CLASSIFIER_PARAMS }, 'adaboost': { 'model': AdaBoostClassifier, - 'param': adaboost_classifier_params + 'param': ADABOOST_CLASSIFIER_PARAMS }, 'catboost': { 'model': CatBoostClassifier, - 'param': catboost_classifier_params + 'param': CATBOOST_CLASSIFIER_PARAMS } } -classification_hspace_base = { +CLASSIFICATION_BASE_HPARAM_SPACE = { 'knn': { 'model': KNeighborsClassifier, 'param': {} @@ -239,7 +264,7 @@ def update(self, options): } } -regression_hspace_base = { +REGRESSION_BASE_HPARAM_SPACE = { 'extratree': { 'model': ExtraTreesRegressor, 'param': {} @@ -266,7 +291,7 @@ def update(self, options): } } -regression_p_hspace_base = { +REGRESSION_PREP_HPARAM_SPACE = { 'cat_encoding': hp.choice('cat_enc', ['count', 'target+count', 'target+label', 'label']), 'scaling': hp.choice('scaling', [True, False]), 'log_transform': hp.choice('log_transform', [True, False]), @@ -282,7 +307,7 @@ def update(self, options): 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) } -classification_p_hspace_base = { +CLASSIFICATION_PREP_HPARAM_SPACE = { 'cat_encoding': hp.choice('cat_enc', ['target', 'count', 'target+count', 'target+label']), 'scaling': hp.choice('scaling', [True, False]), 'log_transform': hp.choice('log_transform', [True, False]), diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 30e411f..fa394d2 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -17,62 +17,99 @@ from sklearn.model_selection import RandomizedSearchCV, cross_val_score import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK -from autokaggle.config import classification_hspace, regression_hspace +from autokaggle.config import REGRESSION_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE,\ + REGRESSION_BASE_HPARAM_SPACE -lgbm_classifier_params = { - 'n_estimators': hp.choice('n_estimators', [100, 150, 200]), -} +class EnsemblingModel: + """ Base class for ensembling estimators. -_classification_hspace = { - 'lgbm': { - 'model': LGBMClassifier, - 'param': lgbm_classifier_params - }, -} + This class creates an ensembling estimator from a given list of estimators. The user can call fit() and + predict() methods, similar to the scikit-learn estimators. - -class RankedEnsembler: + # Arguments + config: Config. Defines the configuration of various components of the autoML pipeline. + estimator_list: List. List of the estimators, to be used for building an ensemble. + """ def __init__(self, estimator_list, config): self.config = config - self.ensemble_method = config.ensemble_method - self.estimators = estimator_list - + self.estimator_list = estimator_list + + @abstractmethod + def fit(self, X, y): + """ Trains the ensemble of estimators on the training data. + # Arguments + X: A numpy array instance containing the training data. + # Returns + None + """ + pass + + @abstractmethod + def predict(self, X): + """ Generate prediction on the test data for the given task. + # Arguments + X: A numpy array instance containing the test data. + # Returns + A numpy array for the predictions on the x_test. + This function provides predicts on the input data using the ensemble of estimators. 
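+ # Example
+ An illustrative sketch using the ranking based subclass defined below (assumes `estimator_list` is a list of AutoPipe instances and `config` is a Config object):
+ ensemble = RankedEnsemblingModel(estimator_list, config=config)
+ ensemble.fit(x_train, y_train)
+ y_pred = ensemble.predict(x_test)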
+ """ + pass + + +class RankedEnsemblingModel(EnsemblingModel): + """ Implements ensembling using ranking based methods. + + This class implements randing based ensembling using ensembling methods amongst: ('mean', 'median', 'max' and + 'majority_voting') + """ def fit(self, X, y): - for est in self.estimators: + for est in self.estimator_list: est.fit(X, y) def predict(self, X): - predictions = np.zeros((len(X), len(self.estimators))) - for i, est in enumerate(self.estimators): + predictions = np.zeros((len(X), len(self.estimator_list))) + for i, est in enumerate(self.estimator_list): predictions[:, i] = est.predict(X) - if self.ensemble_method == 'median': + if self.config.ensemble_method == 'median': return np.median(predictions, axis=1) - elif self.ensemble_method == 'mean': + elif self.config.ensemble_method == 'mean': return np.mean(predictions, axis=1) - elif self.ensemble_method == 'max': + elif self.config.ensemble_method == 'max': return np.max(predictions, axis=1) - elif self.ensemble_method == 'min': + elif self.config.ensemble_method == 'min': return np.min(predictions, axis=1) - elif self.ensemble_method == 'max_voting': + elif self.config.ensemble_method == 'max_voting': return stats.mode(predictions, axis=1)[0] -class StackingEnsembler: - stacking_estimator = None +class StackedEnsemblingModel(EnsemblingModel): + """ Implements a stacking based ensembling estimator. + + This class creates an ensembling estimator using stacking. It trains an Light-GBM model on the predictions of + thh base estimator. + # Arguments + stacking_estimator: LightGBM estimator. Meta-learning algorithm for the stacking estimator. + """ def __init__(self, estimator_list, config): - self.config = config - self.estimator_list = estimator_list - self.objective = config.objective + super().__init__(estimator_list, config) + self.stacking_estimator = None + if self.config.objective == 'regression': - self.hparams = hp.choice('regressor', [regression_hspace[m] for m in ['lgbm']]) + self.hparams = hp.choice('regressor', [REGRESSION_BASE_HPARAM_SPACE['lgbm']]) self.config.stack_probabilities = False else: - self.hparams = hp.choice('classifier', [_classification_hspace[m] for m in ['lgbm']]) + self.hparams = hp.choice('classifier', [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']]) def get_model_predictions(self, X): + """ Generate the combined predictions from the list of the estimators. + # Arguments + X: A numpy array instance containing the training/test data. + # Returns + A numpy array for the predictions of all the estimators in the list. + """ if self.config.stack_probabilities: predictions = np.zeros((len(X), 1)) for i, est in enumerate(self.estimator_list): @@ -98,6 +135,12 @@ def fit(self, X, y): self.stacking_estimator.fit(predictions, y_val) def search(self, x, y): + """ Search function to find best hyper-param setting for the stacking model. + # Arguments + x: A numpy array instance containing the training data + # Returns + List of trials on various hyper-parameter settings. 
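+ Internally this runs hyperopt's `fmin` over the LightGBM meta-learner space for `ensembling_search_iter` evaluations and keeps the best setting found.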
+ """ score_metric, skf = self.get_skf(self.config.cv_folds) def objective_func(args): @@ -112,8 +155,8 @@ def objective_func(args): return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=self.config.ensembling_algo, trials=trials, - max_evals=self.config.ensembling_search_iter) + best = fmin(objective_func, self.hparams, algo=self.config.search_algo, trials=trials, + max_evals=self.config.ensembling_search_iter, rstate=np.random.RandomState(self.config.random_state)) opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) @@ -127,6 +170,12 @@ def predict(self, X): return self.stacking_estimator.predict(predictions) def get_skf(self, folds): + """ Get scoring metric and cross validation folds for the task type + # Arguments + folds: Number of cross validation folds + # Returns + Scoring metric and CV folds + """ if self.config.objective == 'binary': score_metric = 'roc_auc' skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) @@ -139,72 +188,3 @@ def get_skf(self, folds): else: ValueError("Invalid objective") return score_metric, skf - - -class EnsembleSelection: - indices_ = None - weights_ = None - - def __init__(self, estimator_list, config): - self.estimator_list = estimator_list - self.config = config - self.objective = config.objective - self.indices_, self.weights_ = [], [] - self.ensemble_size = len(estimator_list) - if self.objective == 'regression': - self.score_metric = 'neg_mean_squared_error' - self.skf = KFold(n_splits=3, shuffle=True, random_state=1001) - else: - self.score_metric = 'neg_mean_squared_error' - self.skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1001) - - def fit(self, X, y_true): - """Rich Caruana's ensemble selection method. (fast version)""" - ensemble = [] - trajectory = [] - order = [] - - for i in range(self.ensemble_size): - scores = np.zeros((len(self.estimator_list))) - s = len(ensemble) - if s == 0: - weighted_ensemble_prediction = np.zeros(len(self.estimator_list)) - else: - # Memory-efficient averaging! - ensemble_prediction = np.zeros(ensemble[0].shape) - for est in ensemble: - ensemble_prediction += est - ensemble_prediction /= s - - weighted_ensemble_prediction = (s / float(s + 1)) * ensemble_prediction - fant_ensemble_prediction = np.zeros(weighted_ensemble_prediction.shape) - for j, est in enumerate(self.estimator_list): - fant_ensemble_prediction[:,:] = weighted_ensemble_prediction + \ - (1. 
/ float(s + 1)) * est - scores[j] = cross_val_score(self.estimator_list[j], X, y_true, scoring=self.score_metric, - cv=self.skf).mean() - - all_best = np.argwhere(scores == np.nanmin(scores)).flatten() - best = np.random.RandomState.choice(all_best) - ensemble.append(self.estimator_list[best]) - trajectory.append(scores[best]) - order.append(best) - - # Handle special case - if len(self.estimator_list) == 1: - break - - self.indices_ = order - ensemble_members = collections.Counter(self.indices_) - weights = np.zeros((self.ensemble_size,), dtype=float) - for ensemble_member in ensemble_members: - weight = float(ensemble_member[1]) / self.ensemble_size - weights[ensemble_member[0]] = weight - - if np.sum(weights) < 1: - weights = weights / np.sum(weights) - - self.weights_ = weights - - def predict(self, X): - return np.average(X, axis=1, weights=self.weights_) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 4b789d6..d286dd9 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -15,31 +15,30 @@ LEVEL_HIGH = 32 -class TabularPreprocessor(TransformerMixin): - pipeline = None - data_info = None - params = None - config = None - +class Preprocessor(TransformerMixin): + """ Implements basic preprocessing and feature engineering class. + + Preprocessor takes care of the basic preprocessing and feature engineering of the input data. Similar to + Scikit-learn transformers, it implements the fit() and transform() methods. TO acheive this It applies various + feature primitives in a sequence using scikit-learn pipeline. + # Arguments + config: Config. Defines the configuration of various components of the autoML pipeline. + params: Dict. Hyper-parameter search space for preprocessor. + pipeline: Pipeline. Sci-kit learn pipeline class to apply the feature primitives in sequence + """ def __init__(self, config, params): - """ - Initialization function for tabular preprocessor. - """ self.config = config self.params = params + self.pipeline = None def fit(self, raw_x, y): - """ - This function should train the model parameters. - - Args: - raw_x: a numpy.ndarray instance containing the training data. - y: training label vector. - time_limit: remaining time budget. - data_info: meta-features of the dataset, which is an numpy.ndarray describing the - feature type of each column in raw_x. The feature type include: - 'TIME' for temporal feature, 'NUM' for other numerical feature, - and 'CAT' for categorical feature. + """ This function trains the preprocessor chain + # Arguments + raw_x: A numpy array instance containing the training data data. + y: A numpy array instance containing training label vector. + # Returns + None + This function fits the preprocessor chain on the given training data """ data = TabularData(raw_x, self.config.data_info, self.config.verbose) @@ -57,15 +56,13 @@ def fit(self, raw_x, y): return self def transform(self, raw_x): - """ - This function should train the model parameters. - - Args: - raw_x: a numpy.ndarray instance containing the training/testing data. - Both inputs X and y are numpy arrays. - If fit is called multiple times on incremental data (train, test1, test2, etc.) - you should warm-start your training from the pre-trained model. Past data will - NOT be available for re-training. + """ Generate data transformation on the given data. + # Arguments + raw_x: a numpy array instance containing the training/testing data + # Returns + A numpy array instance containing the transformed data. 
+ This function provides transforms the input data by applying the transformations using the pre-trained + preprocessor chain. """ # Get Meta-Feature data = TabularData(raw_x, self.config.data_info, self.config.verbose) @@ -74,6 +71,12 @@ def transform(self, raw_x): @staticmethod def get_categorical_pipeline(params): + """ Generate pipeline of primitives for categorical features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ choice = params.get('cat_encoding', 'target') cat_pipeline = [] if choice == 'target': @@ -96,6 +99,12 @@ def get_categorical_pipeline(params): @staticmethod def get_numerical_pipeline(params): + """ Generate pipeline of primitives for numerical features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ scaling = params.get('scaling', True) log_transform = params.get('log_transform', False) power_transform = params.get('power_transform', False) @@ -116,6 +125,12 @@ def get_numerical_pipeline(params): return numeric_pipeline def get_filtering_pipeline(self, params): + """ Generate pipeline of primitives to filter less useful features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ pearson_thresh = params.get('pearson_thresh', 0) feat_importance_thresh = params.get('feat_importance_thresh', 0) @@ -132,6 +147,12 @@ def get_filtering_pipeline(self, params): @staticmethod def get_time_pipeline(params): + """ Generate pipeline of primitives for time features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ add_offset = params.get('add_time_offset', False) add_diff = params.get('add_time_diff', False) time_pipeline = [] @@ -143,12 +164,24 @@ def get_time_pipeline(params): @staticmethod def get_imputation_pipeline(params): + """ Generate pipeline of primitives to impute the missing values. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ strategy = params.get('imputation_strategy', 'most_frequent') impute_pipeline = [('imputer', Imputation(operation='upd', selected_type='ALL', strategy=strategy))] return impute_pipeline @staticmethod def get_higher_order_pipeline(params): + """ Generate pipeline of primitives to generate cross-column features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ cat_num_strategy = params.get('cat_num_strategy', None) cat_cat_strategy = params.get('cat_cat_strategy', None) pipeline = [] @@ -162,16 +195,35 @@ def get_higher_order_pipeline(params): class TabularData: - cat_col = None - num_col = None - time_col = None - n_cat, n_time, n_num = 0, 0, 0 - cat_cardinality = None - generated_features = None - feature_options = None - num_info = None - + """ Represents the data and its meta-info. + + TabularData includes the training/testing data along with its meta info such as data types, cardinality etc. The + user can update the data and its meta info as well as select the features matching the criteria. + # Arguments + verbose: Bool. Determines the verbosity of the logging. + data_info: Dict. 
Dictionary mapping the feature names to their data_types. + total_samples: Int. Number of samples in the data + cat_col: List. List of the categorical features + num_col: List. List of the numerical features + time_col: List. List of the time features + n_cat: Int. Number of categorical features + n_num: Int. Number of numerical features + n_time: Int. Number of time features + cat_cardinality: Dict. Dictionary mapping categorical feature names of their cardinality (no. of unique + values) + generated_features: List. List of the newly added features. (In addition to the pre-existing columns) + num_info: Dict. Dictionary mapping numeircal column to their meta info such as range, std etc. + """ def __init__(self, raw_x, data_info, verbose=True): + self.cat_col = None + self.num_col = None + self.time_col = None + self.n_cat = 0 + self.n_time = 0 + self.n_num = 0 + self.cat_cardinality = None + self.generated_features = None + self.num_info = None self.verbose = verbose self.data_info = {str(i): data_info[i] for i in range(len(data_info))} self.total_samples = raw_x.shape[0] @@ -188,14 +240,35 @@ def __init__(self, raw_x, data_info, verbose=True): # self.update_cat_cardinality() def update_type(self, columns, new_type): + """ Updates the column datatype. + # Arguments + column: List of columns whose data_type needs update. + new_type: New data_type (either of 'CAT', 'NUM' or 'TIME'). + # Returns + None. + This function updates the data types of given list of columns. + """ for c in columns: self.data_info[c] = new_type def delete_type(self, columns): + """ Delete the columns from the feature to data_type mapping. + # Arguments + column: List of columns whose data_type needs update. + # Returns + None + This function removes the selected columns from the data_info dictionary. + """ for c in columns: _ = self.data_info.pop(c, 0) def rename_cols(self, key): + """ Provides a rename function to add new columns without collision. + # Arguments + key: Identifier for renaming + # Returns + Renaming function which takes current column name and outputs a new unique column name. + """ def rename_fn(col_name): col_name = str(col_name) col_name += '_' + key @@ -205,6 +278,18 @@ def rename_fn(col_name): return rename_fn def update(self, operation, columns, x_tr, new_type=None, key=''): + """ Updates the TabularData after applying primitive. + # Arguments + operation: Primitive operation applied ('add', 'update' or 'delete'). + columns: List of columns affected. + x_tr: Transformed (or newly generated) features + new_type: Data type of the new column + key: Name key for renaming the new columns + # Returns + None + This function takes the transformed (or generated) features after applying the primitive and updates the + TabularData. + """ if operation == 'upd': if x_tr is not None: self.X[columns] = x_tr @@ -224,6 +309,13 @@ def update(self, operation, columns, x_tr, new_type=None, key=''): self.refresh_col_types() def refresh_col_types(self): + """ Updates the column_types based on the data_info + # Arguments + None + # Returns + None + This function updates the cat, num and time column lists based on (any) updates in the data_info. 
+ """ self.cat_col = [k for k, v in self.data_info.items() if v == 'CAT'] self.num_col = [k for k, v in self.data_info.items() if v == 'NUM'] self.time_col = [k for k, v in self.data_info.items() if v == 'TIME'] @@ -232,6 +324,12 @@ def refresh_col_types(self): self.n_cat = len(self.cat_col) def update_cat_cardinality(self): + """ Update categorical cardinality mapping for all categorical columns. + # Arguments + None + # Returns + None + """ # TODO: too slow make it faster if not self.cat_cardinality: self.cat_cardinality = {} @@ -239,6 +337,12 @@ def update_cat_cardinality(self): self.cat_cardinality[c] = len(set(self.X[c])) def select_columns(self, data_type): + """ Returns all the columns matching the input data_type + # Arguments + data_type: Required type of the data (either of 'CAT', 'NUM', 'TIME' or 'ALL') + # Returns + List of the feature columns matching the input criteria. + """ self.refresh_col_types() if data_type == 'CAT': return self.cat_col @@ -254,22 +358,47 @@ def select_columns(self, data_type): class Primitive(BaseEstimator, TransformerMixin): - selected = None - drop_columns = None - options = None - supported_ops = ('add', 'upd', 'del') - name_key = '' - + """ Base class for the single order data transformation function. + + Primitive learns and applies the data transformation on a given set of features. The user can use fit() and + transform() functions to apply these transformations. + + # Arguments + options: Dict. Special arguments specific to the given primitive. + selected_type: 'String'. Specifies the type of features the transformation is supposed to be applied to. + operation: 'String'. Specifies the type of operation from 'add', 'update' or 'delete' + name_key : 'String'. Signature key to rename the column after applying the primitive. + selected: 'List'. List of the selected features, on which the transformation will be applied + drop_columns: 'List'. List of the features which would be dropped after applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this primitive. + """ def __init__(self, operation='upd', selected_type=None, **kwargs): + self.options = None + self.selected = None + self.drop_columns = None + self.supported_ops = ('add', 'upd', 'del') self.selected_type = selected_type self.operation = operation self.init_vars(**kwargs) self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): + """ Initialize the primitive specific variables (which are not defined in the base class) + # Arguments + kwargs: Dictionary containing primitive specific variables + # Returns + None. + """ self.options = kwargs def fit(self, data, y=None): + """ A wrapper function to train the given primitive on the input training data. + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ self.selected = data.select_columns(self.selected_type) if self.operation not in self.supported_ops: print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) @@ -279,28 +408,68 @@ def fit(self, data, y=None): return self._fit(data, y) def transform(self, data, y=None): + """ A wrapper function to generate transformation on the input data based on pre-trained primitive. + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. 
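+ # Example
+ An illustrative sketch with one of the concrete primitives defined below (assumes `data` is a TabularData instance and `y` a numpy target array):
+ imputer = Imputation(operation='upd', selected_type='ALL', strategy='most_frequent')
+ imputer.fit(data, y)
+ data = imputer.transform(data)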
+ """ if not self.selected: return data return self._transform(data, y) @abstractmethod def _fit(self, data, y=None): + """ Contains the actual implementation of training the primitive (implemented in the child class) + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ pass @abstractmethod def _transform(self, data, y=None): + """ Contains the actual implementation of transforming the data using primitive. (implemented in the child + class) + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. + """ pass class PrimitiveHigherOrder: - selected_1 = None - selected_2 = None - drop_columns = None - options = None - supported_ops = ('add', 'upd', 'del') - name_key = '' - + """ Base class for the cross-order data transformation function. + + PrimitiveHigherOrder learns and applies the data transformation across two sets of features. The user can use + fit() and transform() functions to apply these transformations. + + # Arguments + options: Dict. Special arguments specific to the given primitive. + selected_type1: 'String'. Specifies the first type of features the transformation is supposed to be applied + to. + selected_type2: 'String'. Specifies the second type of features the transformation is supposed to be applied + to. + operation: 'String'. Specifies the type of operation from 'add', 'update' or 'delete' + name_key : 'String'. Signature key to rename the column after applying the primitive. + selected_1: 'List'. List of the selected features in the first set, on which the transformation will be + applied + selected_2: 'List'. List of the selected features in the second set, on which the transformation will be + applied + drop_columns: 'List'. List of the features which would be dropped after applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this primitive. + """ def __init__(self, operation='upd', selected_type1=None, selected_type2=None, **kwargs): + self.options = None + self.selected_1 = None + self.selected_2 = None + self.drop_columns = None + self.supported_ops = ('add', 'upd', 'del') self.operation = operation self.selected_type1 = selected_type1 self.selected_type2 = selected_type2 @@ -308,9 +477,22 @@ def __init__(self, operation='upd', selected_type1=None, selected_type2=None, ** self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): + """ Initialize the primitive specific variables (which are not defined in the base class) + # Arguments + kwargs: Dictionary containing primitive specific variables + # Returns + None. + """ self.options = kwargs def fit(self, data, y=None): + """ A wrapper function to train the given primitive on the input training data. + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ self.selected_1 = data.select_columns(self.selected_type1) self.selected_2 = data.select_columns(self.selected_type2) @@ -323,20 +505,49 @@ def fit(self, data, y=None): return self._fit(data, y) def transform(self, data, y=None): + """ A wrapper function to generate transformation on the input data based on pre-trained primitive. + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. 
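+ # Example
+ An illustrative sketch (hypothetical argument values; the real pipeline builds these primitives in `Preprocessor.get_higher_order_pipeline`):
+ encoder = CatNumEncoder(operation='add', selected_type1='CAT', selected_type2='NUM', strategy='mean')
+ encoder.fit(data, y)
+ data = encoder.transform(data)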
+ """ if not self.selected_1 or not self.selected_2: return data return self._transform(data, y) @abstractmethod def _fit(self, data, y=None): + """ Contains the actual implementation of training the primitive (implemented in the child class) + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ pass @abstractmethod def _transform(self, data, y=None): + """ Contains the actual implementation of transforming the data using primitive. (implemented in the child + class) + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. + """ pass class TabScaler(Primitive): + """ Standard Scaler primitive. + + TabScaler scales the selected numerical features to have 0 mean and unit variance. + + # Arguments + scaler: StandardScaler. Instance of scikit-learn StandardScaler object + """ scaler = None supported_ops = ('add', 'upd') @@ -352,6 +563,13 @@ def _transform(self, data, y=None): class BoxCox(Primitive): + """ Power Transform primitive. + + The class applies BoxCox power transformation to make the selected features have normal distribution. + + # Arguments + transformer: PowerTransformer. Instance of scikit-learn PowerTransformer object + """ transformer = None supported_ops = ('add', 'upd') @@ -367,6 +585,17 @@ def _transform(self, data, y=None): class Binning(Primitive): + """ Numerical binning primitive. + + The class applies divides the given numeric column in the list of buckets, based on the range of their values. + + # Arguments + binner: KBinsDiscretizer. Instance of scikit-learn KBinsDiscretizer object + strategy: String. Strategy used to define width of the bins. Possible options are: (‘uniform’, ‘quantile’, + ‘kmeans’) + encoding: String. Method used to encode the transformed result. Possible options are: (‘onehot’, + ‘onehot-dense’, ‘ordinal’) + """ binner = None strategy = None encoding = None @@ -388,6 +617,13 @@ def _transform(self, data, y=None): class OneHot(Primitive): + """ One Hot Encoder for categorical features. + + The class applies one hot encoding to categorical features, using the sklearn implementation. + + # Arguments + ohe: OneHotEncoder. Instance of scikit-learn OneHotEncoder object + """ ohe = None supported_ops = ('add', 'upd') @@ -407,6 +643,15 @@ def _transform(self, data, y=None): class LabelEncode(Primitive): + """ Label Encoder for categorical features. + + The class applies Label Encoding to categorical features, By mapping each category to a numerical value. + + # Arguments + cat_to_int_label: Dict. Mapping from categories to their assigned integer value + unknown_key_dict: Dict. Mapping for each categorical feature column to the integer value to replace the + previously unseen categories + """ cat_to_int_label = None unknown_key_dict = None supported_ops = ('add', 'upd') @@ -428,11 +673,28 @@ def _transform(self, data, y=None): class TargetEncoder(Primitive): + """ Target Encoder for categorical features. + + The class applies target encoding to categorical features, By learning the mapping of category to numeric value + based on some aggregation of the target value. + + # Arguments + target_encoding_map: Dict. Mapping from categories to their assigned numeric value + """ target_encoding_map = None supported_ops = ('add', 'upd') @staticmethod def calc_smooth_mean(df, by, on, alpha=5): + """ Calculates the smoothed means on the target value. 
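+ A common smoothing scheme (assumed here for illustration) maps each category to (n * category_mean + alpha * global_mean) / (n + alpha), where n is the number of rows in the category, so rare categories are pulled towards the global target mean.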
+ # Arguments + df: Input dataframe + by: Groupby column (categorical column) + on: Target column + alpha: smoothing factor + # Returns + smoothed mean and the overall mean + """ # Compute the global mean mean = df[on].mean() @@ -463,6 +725,14 @@ def _transform(self, data, y=None): class CatCatEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and categorical columns. + + The class learns a new features based on the values of selected two categorical features. + + # Arguments + cat_cat_map: Dict. Mapping from cat-cat combination to the an assigned numeric value + strategy: String. Aggregation strategy to learn the mapping between cat-cat combination to numeric value + """ supported_ops = ('add', ) cat_cat_map = None strategy = None @@ -472,6 +742,14 @@ def init_vars(self, strategy='count'): @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): + """ Generate mapping for cat-cat combination to the numerical value based on the given strategy. + # Arguments + col1: First categorical column + col2: Second categorical column + strategy: Aggregation strategy + # Returns + Mapping from cat-cat combination to the numeric value.. + """ if strategy == 'count': mapping = df.groupby([col1])[col2].count() elif strategy == 'nunique': @@ -497,6 +775,14 @@ def _transform(self, data, y=None): class CatNumEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and numerical columns. + + The class learns a new features based on the values of selected categorical and numerical features. + + # Arguments + cat_num_map: Dict. Mapping from cat-num combination to the an assigned numeric value + strategy: String. Aggregation strategy to learn the mapping between cat-num combination to numeric value + """ supported_ops = ('add', ) cat_num_map = None strategy = None @@ -506,6 +792,14 @@ def init_vars(self, strategy='mean'): @staticmethod def cat_num_interaction(df, col1, col2, method='mean'): + """ Generate mapping for cat-num combination to the numerical value based on the given strategy. + # Arguments + col1: categorical column + col2: numerical column + method: Aggregation strategy + # Returns + Mapping from cat-num combination to the numeric value.. + """ if method == 'mean': mapping = df.groupby([col1])[col2].mean() elif method == 'std': @@ -537,6 +831,14 @@ def _transform(self, data, y=None): class CatBinEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and binary columns. + + The class learns a new features based on the values of selected categorical and binary features. + + # Arguments + cat_bin_map: Dict. Mapping from cat-bin combination to the an assigned numeric value + strategy: String. Aggregation strategy to learn the mapping between cat-bin combination to numeric value + """ supported_ops = ('add', ) cat_bin_map = None strategy = None @@ -546,6 +848,14 @@ def init_vars(self, strategy='percent_true'): @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): + """ Generate mapping for cat-bin combination to the numerical value based on the given strategy. + # Arguments + col1: Categorical column + col2: Binary column + strategy: Aggregation strategy + # Returns + Mapping from cat-bin combination to the numeric value.. 
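The cross-column mappings described above boil down to a groupby-aggregate followed by a map. A toy illustration (the column and feature names are made up for the example):

import pandas as pd

df = pd.DataFrame({'shop': ['a', 'a', 'b', 'b'],
                   'price': [10.0, 30.0, 5.0, 15.0],
                   'returned': [0, 1, 0, 0]})

# CatNumEncoder with strategy='mean': average numeric value per category.
cat_num_map = df.groupby('shop')['price'].mean()
# CatBinEncoder with strategy='percent_true': mean of a 0/1 column per category.
cat_bin_map = df.groupby('shop')['returned'].mean()

df['shop_price_mean'] = df['shop'].map(cat_num_map)
df['shop_return_rate'] = df['shop'].map(cat_bin_map)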
+ """ if strategy == 'percent_true': mapping = df.groupby([col1])[col2].mean() elif strategy == 'count': @@ -572,6 +882,11 @@ def _transform(self, data, y=None): class FilterConstant(Primitive): + """ Filters the constant or very low variance columns. + + The class finds the non-changing or very low variance columns and marks them for deletion, so that they are + not used by the machine learning estimator. + """ drop_columns = None supported_ops = ('del',) @@ -586,6 +901,10 @@ def _transform(self, data, y=None): class TimeDiff(Primitive): + """ Adds features based on difference of time values. + + This class generates new features as the time difference between two selected time columns. + """ supported_ops = ('add', ) def _fit(self, data, y=None): @@ -600,6 +919,13 @@ def _transform(self, data, y=None): class TimeOffset(Primitive): + """ Updates the time features in terms of difference from the start value. + + This class updates the time features such that they are represented as a difference from the start time. + + # Arguments + start_time: Int. Starting time of the selected time feature. + """ start_time = None supported_ops = ('add', 'upd') @@ -615,6 +941,12 @@ def _transform(self, data, y=None): class TabPCA(Primitive): + """ Generates new features by finding PCA of the selected features. + + The class calculates the PCA of the selected features and adds the transformation as a new set of features. + # Arguments + pca: PCA. Scikit-learn PCA class. + """ pca = None supported_ops = ('add', ) @@ -631,6 +963,13 @@ def _transform(self, data, y=None): class CatCount(Primitive): + """ Count Encoding. + + Replaces the categorical variables by their occurrence count. + # Arguments + count_dict: Dict. Mapping of the categories to their respective frequency count. + unknown_key: Float. Mapping value for previously unseen category. + """ count_dict = None unknown_key = 0 supported_ops = ('add', 'upd') @@ -650,6 +989,11 @@ def _transform(self, data, y=None): class LogTransform(Primitive): + """ Calculates the log transformation. + + The class calculates the log transform value of the given numeric feature. The formula is: + sign(x) * log(1 + mod(x)) + """ name_key = 'log_' supported_ops = ('add', 'upd') @@ -665,6 +1009,12 @@ def _transform(self, data, y=None): class Imputation(Primitive): + """ Imputes the missing values in the features. + + The class fills in the missing values of each feature column based on the chosen imputation strategy (such as the most frequent value or zero). + # Arguments + impute_dict: Dict. Mapping of each feature column to its imputation value. + """ impute_dict = None supported_ops = ('add', 'upd') strategy = None @@ -693,6 +1043,12 @@ def _transform(self, data, y=None): class FeatureFilter(Primitive): + """ Filters the features based on Pearson Correlation. + + The class removes the features which have low Pearson correlation with the target. + # Arguments + threshold: Float. Threshold for filtering features. + """ threshold = None supported_ops = ('del',) @@ -717,6 +1073,14 @@ def _transform(self, data, y=None): class FeatureImportance(Primitive): + """ Filters the features based on feature importance score. + + The class learns a LightGBM estimator on the given data and, based on the feature importance scores, filters + out the features with importance lower than the threshold. + # Arguments + threshold: Float. Threshold for filtering features. + task_type: 'String'.
Specifies the task type amongst: ('classification', 'regression') + """ threshold = None task_type = 'classification' supported_ops = ('del',) @@ -776,7 +1140,7 @@ def _transform(self, data, y=None): datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) print(x_train[:4, 20]) - prep = TabularPreprocessor() + prep = Preprocessor() prep.fit(x_train, y_train, 24*60*60, datainfo) x_new = prep.transform(x_train) diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 604d92e..0b5b22a 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -16,6 +16,7 @@ openml.config.apikey = '3c7196c92a274c3b9405a7e26e9f848e' import warnings from abc import abstractmethod +import statistics def generate_rand_string(size): @@ -24,6 +25,20 @@ def generate_rand_string(size): class BenchmarkingBase: + """ Base class for benchmarking autoML platforms. + + This class benchmarks the performance of the given autoML platform. The user can call evaluate() method to + evaluate the performance on a single task or run_automation() for the list of the tasks. The tasks are OpenML + tasks, which specify the dataset and the train/test/validation folds etc. + + # Arguments + results: List. List of the results for each evaluation + sess_name: String. Name of the evaluation session, used for storing the results. + cls_desc: List. List of the columns to be added in classification result + rgs_desc: List. List of the columns to be added in regression result + cls_results: DataFrame. Table storing the classification results + rgs_results: DataFrame. Table storing the regression results + """ results = None cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", "balanced_accuracy", "F1_score", "AUC"] rgs_desc = ["automl_model", "task_id", "time_limit", "MSE", "MAE", "R2_score"] @@ -37,6 +52,14 @@ def __init__(self, supress_warnings=True, sess_name=""): self.rgs_results = pd.DataFrame(columns=self.rgs_desc) def measure_performance_cls(self, y_true, y_pred, binary=False): + """ Calculate the performance of the classification task + # Arguments + y_true: A numpy array containing the ground truth labels + y_pred: A numpy array containing the predicted labels + binary: Boolean specifying if the objective isbinary or multiclass + # Returns + list of the performance scores based on various evaluation metrics. + """ accuracy = accuracy_score(y_true, y_pred) ber = balanced_accuracy_score(y_true, y_pred) f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score(y_true, y_pred, average="weighted") @@ -44,12 +67,25 @@ def measure_performance_cls(self, y_true, y_pred, binary=False): return [accuracy, ber, f1, auc] def measure_performance_rgs(self, y_true, y_pred): + """ Calculate the performance of the regression task + # Arguments + y_true: A numpy array containing the ground truth + y_pred: A numpy array containing the predicted values + # Returns + list of the performance scores based on various evaluation metrics. + """ mse = mean_squared_error(y_true, y_pred) mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred) return [mse, mae, r2] def export_results(self): + """ Writes the results to a CSV file. + # Arguments + None + # Returns + None + """ if len(self.cls_results) > 0: self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) if len(self.rgs_results) > 0: @@ -57,9 +93,23 @@ def export_results(self): @abstractmethod def evaluate(self, task, time_limit): + """ Evaluates the performance of the single task. 
+ # Arguments + task: Id of the OpenML task flow + time_limit: Budget for the given task + # Returns + List of performance scores of the autoML system on the given task. + """ pass def run_automation(self, task_list, time_limit=10*60): + """ Evaluates the list of tasks in sequence + # Arguments + task_list: List of OpenML task ids + time_limit: Budget for each of the tasks + # Returns + None + """ for task in task_list: try: self.evaluate(task, time_limit=time_limit) @@ -68,12 +118,27 @@ def run_automation(self, task_list, time_limit=10*60): print("task: {} didn't work".format(task)) def time_lapse(self, task_id, time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): + """ Evaluates the task on different time_limits + # Arguments + task_id: Id of the OpenML task flow + time_limits: List of the time_limits to test the performance on + # Returns + List of combined results of the autoML on each of the time_limits + This function evaluates and compares the performance of the autoML system on different time_limits. It is + helpful to understand the amount of improvement with an increase in time budget + """ tl_results = [] for time_limit in time_limits: tl_results.append(self.evaluate(task_id, time_limit=time_limit)) return tl_results def get_dataset_splits(self, task_id): + """ Get the train/test splits for the given task + # Arguments + task_id: Id of OpenML task flow + # Returns + Train/Test datasets in numpy array format + """ task = openml.tasks.get_task(task_id) train_indices, test_indices = task.get_train_test_split_indices() dataset = task.get_dataset() @@ -85,12 +150,18 @@ class BenchmarkingAutoKaggle(BenchmarkingBase): - estimator_type = None - - def set_estimator_type(self, est_type): - self.estimator_type = est_type - + """ Extends the benchmarking class for evaluating AutoKaggle. + + This class evaluates the performance of AutoKaggle on the input classification or regression task_list. + """ def get_data_info(self, dataset, num_cols): + """ Get the info of each feature data type + # Arguments + dataset: dataset id in OpenML + num_cols: Total number of columns + # Returns + A numpy array containing the data_type of each feature column + """ nominal_feat = dataset.get_features_by_type('nominal') numerical_feat = dataset.get_features_by_type('numeric') string_feat = dataset.get_features_by_type('string') @@ -107,6 +178,9 @@ return np.array(data_info) def evaluate(self, task_id, time_limit=10*60): + """ + See base class. + """ task_info = ["autokaggle", task_id, time_limit] task = openml.tasks.get_task(task_id) train_indices, test_indices = task.get_train_test_split_indices() @@ -121,9 +195,9 @@ # Train if task.task_type == 'Supervised Classification': - automl = AutoKaggleClassifier() + automl = Classifier() elif task.task_type == 'Supervised Regression': - automl = AutoKaggleRegressor() + automl = Regressor() else: print("UNSUPPORTED TASK_TYPE") assert(0) @@ -145,6 +219,10 @@ def evaluate(self, task_id, time_limit=10*60): # # class BenchmarkingAutoSklearn(BenchmarkingBase): + """ Extends the benchmarking class for evaluating AutoSklearn. + + This class evaluates the performance of AutoSklearn on the input classification or regression task_list.
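For context, the OpenML plumbing used by these benchmarks follows the pattern below. Task id 31 is just an example, and get_X_and_y() is assumed to be available on supervised tasks in openml-python; the code above goes through task.get_dataset() instead.

import openml

task = openml.tasks.get_task(31)                        # example OpenML task id
train_idx, test_idx = task.get_train_test_split_indices()
X, y = task.get_X_and_y()                               # assumed helper on supervised tasks
x_train, y_train = X[train_idx], y[train_idx]
x_test, y_test = X[test_idx], y[test_idx]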
+ """ # def get_data_info(self, categorical_indicator): # return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] # @@ -187,14 +265,29 @@ def evaluate(self, task_id, time_limit=10*60): # self.results.append(result) # print(result) # return result - + + def get_dataset_ids(task_ids): + """ Fetches the dataset_ids. + # Arguments + task_ids: List of ids of OpenML task flows + # Returns + dataset_list: List of the dataset Ids + """ if type(task_ids) == list: - return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids] + return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids] else: - return openml.tasks.get_task(task_ids).dataset_id + return openml.tasks.get_task(task_ids).dataset_id + def get_task_info(task_ids): + """ Fetches the dataset_ids and the task objective. + # Arguments + task_ids: List of ids of OpenML task flows. + # Returns + dataset_list: List of the dataset Ids. + task_types: List of the task type (such as 'binary/multiclass classification' or 'regression' + """ task_types = [] dataset_list = [] for i, t_id in enumerate(task_ids): @@ -211,6 +304,13 @@ def get_task_info(task_ids): def get_dataset_properties(task_ids): + """ Fetches the properties of the dataset for given task flow id + # Arguments + task_ids: List of ids of OpenML task flows + # Returns + Dataframe containing the info of each of the dataset. + This function provides the dataset info such as number of instances, number of numeric/nominal/string columns etc. + """ dataset_list, task_types = get_task_info(task_ids) df = pd.DataFrame(columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", "#Date"]) for i, dataset in enumerate(dataset_list): @@ -225,9 +325,16 @@ def get_dataset_properties(task_ids): ] return df + def get_performance_table(filename, metric): - """ - Read the results csv and convert into the performance table based on the median of the results for each task. + """ Generates a comprehensive report table of AutoML performance. + # Arguments + filename: A csv file containing the results of AutoML runs + metric: Scoring metric to be used for comparison + # Returns + Pandas Dataframe listing the performance of different AutoML systems on the given datasets. + This function reads the results csv and converts it into the performance table based on the median of the results + for each task. """ test = pd.read_csv(filename) perf = pd.DataFrame(columns=["Name", "AutoKaggle", "AutoSklearn", "H2O.ai"]) @@ -247,27 +354,37 @@ def get_performance_table(filename, metric): print(e) return perf + def style_results(res): - """ - Highlight the max results and set index to name + """ Highlights the best result in the results column + # Arguments + res: Dataframe containing the results of various AutoML runs + # Returns + Highlighed data-frame """ def highlight_max(s): - ''' - highlight the maximum in a Series yellow. - ''' + """ + Highlight the maximum in a Series yellow. + """ is_max = s == s.max() return ['background-color: yellow' if v else '' for v in is_max] res = res.set_index("Name") res.style.apply(highlight_max, axis=1) return res -import statistics -def get_box_plot(data, task_id, metric): - """ - Plots the boxplot of variance + +def get_box_plot(results, task_id, metric): + """ Generates a box plot of the variance in the result. 
+ # Arguments + results: Results of various runs using AutoML systems + task_id: Id for OpenML task flow + metric: Score metric considered for the box-plot + # Returns + None + Builds and displays the box plot showing the variance in results for the AutoML performance on the given dataset. """ - auto_sklearn = list(data.loc[(task_id, "autosklearn")][metric]) - auto_kaggle = list(data.loc[(task_id, "autokaggle")][metric]) + auto_sklearn = list(results.loc[(task_id, "autosklearn")][metric]) + auto_kaggle = list(results.loc[(task_id, "autokaggle")][metric]) med_sk = statistics.median(auto_sklearn) med_ak = statistics.median(auto_kaggle) while len(auto_sklearn) < len(auto_kaggle): @@ -290,8 +407,10 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) # t2 = time.time() # print(t2-t1) + np.random.seed(1001) + random.seed(1001) import time t1 = time.time() - ak.evaluate(31) + ak.evaluate(3021) t2 = time.time() print(t2-t1) diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index fa8638d..e5d3b6f 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -19,7 +19,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggleClassifier() + clf = Classifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 109d361..3426dd3 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggleClassifier() + clf = Classifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index ec5af7a..bf97bc7 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggleRegressor() + clf = Regressor() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 7e651e67348a55a7409c5bad81dc7b90f95a5810 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 26 Sep 2019 18:59:34 -0500 Subject: [PATCH 31/31] Fix Indent --- autokaggle/auto_ml.py | 278 ++++++++++++++------- autokaggle/config.py | 95 +++++--- autokaggle/ensemblers.py | 82 ++++--- autokaggle/preprocessor.py | 478 ++++++++++++++++++++++++------------- autokaggle/utils.py | 13 +- examples/benchmarking.py | 150 +++++++----- 6 files changed, 715 insertions(+), 381 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 4c2dc47..3cfef69 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -8,10 +8,13 @@ from joblib import dump, load from autokaggle.preprocessor import Preprocessor -from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, \ + read_json from lightgbm import LGBMClassifier, LGBMRegressor -from autokaggle.config import Config, CLASSIFICATION_PREP_HPARAM_SPACE, REGRESSION_PREP_HPARAM_SPACE, \ - 
REGRESSION_BASE_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, REGRESSION_HPARAM_SPACE +from autokaggle.config import Config, CLASSIFICATION_PREP_HPARAM_SPACE, \ + REGRESSION_PREP_HPARAM_SPACE, \ + REGRESSION_BASE_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE, \ + CLASSIFICATION_HPARAM_SPACE, REGRESSION_HPARAM_SPACE from sklearn.model_selection import StratifiedKFold, KFold import hyperopt from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL @@ -24,45 +27,63 @@ class AutoKaggle(BaseEstimator): """ Automated Machine Learning system class. - AutoKaggle implements an end to end automated ML system. It initiates and searches for the optimum ML pipeline. - The user can use it with the simple `fit()` and `predict()` methods like Sci-kit learn estimators. - The user can specify various parameters controlling different components of the system. + AutoKaggle implements an end to end automated ML system. It initiates and + searches for the optimum ML pipeline. The user can use it with the simple + `fit()` and `predict()` methods like Sci-kit learn estimators. + The user can specify various parameters controlling different components + of the system. # Arguments path: String. OS path for storing temporary model parameters. verbose: Bool. Defines the verbosity of the logging. time_limit: Int. Time budget for performing search and fit pipeline. use_ensembling: Bool. Defines whether to use an ensemble of models - num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + num_estimators_ensemble: Int. Maximum number of estimators to be used + in an ensemble ensemble_strategy: String. Strategy to ensemble models - ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling - random_ensemble: Bool. Whether the ensembling estimators are picked randomly. - diverse_ensemble: Bool. Whether estimators from different families are picked. - ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + ensemble_method: String. Aggregation method if ensemble_strategy is + set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked + randomly. + diverse_ensemble: Bool. Whether estimators from different families are + picked. + ensembling_search_iter: Int. Search iterations for ensembling + hyper-parameter search search_algo: String. Search strategy for hyper-parameter search. search_iter: Int. Number of iterations used for hyper-parameter search. cv_folds: Int. Number of Cross Validation folds. - subsample_ratio: Percent of subsample used for for hyper-parameter search. + subsample_ratio: Percent of subsample used for for hyper-parameter + search. data_info: list(String). Lists the datatypes of each feature column. - stack_probabilities: Bool. Whether to use class probabilities in ensembling. + stack_probabilities: Bool. Whether to use class probabilities in + ensembling. upsample_classes: Bool. Whether to upsample less represented classes num_p_hparams: Int. Number of preprocessor search spaces. 
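A typical call against this class mirrors the example scripts updated later in this patch. The dataset below is a random placeholder and the import path is assumed from the new module layout (the Classifier class lives in autokaggle/auto_ml.py in this patch).

import numpy as np
from autokaggle.auto_ml import Classifier   # import path assumed

x_train = np.random.rand(200, 5)
y_train = np.random.randint(0, 2, 200)
data_info = np.array(['NUM'] * 5)           # one of 'TIME', 'NUM', 'CAT' per column

clf = Classifier()
clf.fit(x_train, y_train, time_limit=60 * 60, data_info=data_info)
y_pred = clf.predict(np.random.rand(50, 5))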
""" def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, - num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', - search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, - stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + num_estimators_ensemble=50, ensemble_strategy='stacking', + ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, + random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, + ensembling_search_iter=10, search_algo='random', num_p_hparams=10): self.is_trained = False if not path: path = rand_temp_folder_generator() - self.config = Config(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, - num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, - ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, - subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, - diverse_ensemble=diverse_ensemble, stack_probabilities=stack_probabilities, + self.config = Config(path=path, verbose=verbose, time_limit=time_limit, + use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, + ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, + search_iter=search_iter, cv_folds=cv_folds, + subsample_ratio=subsample_ratio, + random_ensemble=random_ensemble, + diverse_ensemble=diverse_ensemble, + stack_probabilities=stack_probabilities, data_info=data_info, upsample_classes=upsample_classes, - ensembling_search_iter=ensembling_search_iter, search_algo=search_algo, + ensembling_search_iter=ensembling_search_iter, + search_algo=search_algo, num_p_hparams=num_p_hparams) self.pipeline = None self.m_hparams = None @@ -75,20 +96,25 @@ def fit(self, x, y, time_limit=None, data_info=None): x: A numpy.ndarray instance containing the training data. y: training label vector. time_limit: remaining time budget. - data_info: meta-features of the dataset, which is an numpy.ndarray describing the feature type of each - column in raw_x. The feature type include: 'TIME' for temporal feature, 'NUM' for other numerical feature, + data_info: meta-features of the dataset, which is an numpy.ndarray + describing the feature type of each + column in raw_x. The feature type include: 'TIME' for temporal + feature, 'NUM' for other numerical feature, and 'CAT' for categorical feature. # Returns None Both inputs X and y are numpy arrays. - If fit is called multiple times on incremental data (train, test1, test2, etc.) - you should warm-start your training from the pre-trained model. Past data will + If fit is called multiple times on incremental data (train, test1, test2, + etc.) + you should warm-start your training from the pre-trained model. Past data + will NOT be available for re-training. """ self.config.time_limit = time_limit if time_limit else 24 * 60 * 60 # Extract or read data info - self.config.data_info = data_info if data_info is not None else self.extract_data_info(x) + self.config.data_info = data_info if data_info is not None else \ + self.extract_data_info(x) if self.config.verbose: print('DATA_INFO: {}'.format(self.config.data_info)) @@ -130,7 +156,8 @@ def predict(self, x_test): return y def predict_proba(self, x_test): - """ Predict label probabilities on the test data for the given classification task. 
+ """ Predict label probabilities on the test data for the given + classification task. # Arguments x_test: A numpy.ndarray instance containing the test data. # Returns @@ -143,7 +170,8 @@ def predict_proba(self, x_test): return y def evaluate(self, x_test, y_test): - """ Predict label probabilities on the test data for the given classification task. + """ Predict label probabilities on the test data for the given + classification task. # Arguments x_test: A numpy.ndarray instance containing the training data. y_test: A numpy array with ground truth labels for the test data @@ -171,7 +199,8 @@ def resample(self, x, y): Up-sampled version of the dataset """ if self.config.upsample_classes: - x, y = SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(x, y) + x, y = SMOTE( + sampling_strategy=self.config.resampling_strategy).fit_resample(x, y) while x.shape[0] < 60: x = np.concatenate([x, x], axis=0) y = np.concatenate([y, y], axis=0) @@ -207,16 +236,19 @@ def search(self, x, y, prep_space, model_space): # Returns List of hyper-parameter trials """ - grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) + grid_train_x, grid_train_y = \ + self.subsample(x, y, sample_percent=self.config.subsample_ratio) score_metric, skf = self.get_skf(self.config.cv_folds) def objective_func(params): model_class = params['estimator']['model'] m_params = params['estimator']['param'] p_params = params['prep'] - pipeline = AutoPipe(model_class=model_class, m_params=m_params, p_params=p_params, config=self.config) + pipeline = AutoPipe(model_class=model_class, m_params=m_params, + p_params=p_params, config=self.config) try: - eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y, + scoring=score_metric, cv=skf).mean() status = STATUS_OK except ValueError as e: print(e) @@ -226,13 +258,16 @@ def objective_func(params): print("CV Score:", eval_score) print("\n=================") loss = 1 - eval_score if status == STATUS_OK else float('inf') - return {'loss': loss, 'status': status, 'model_class': model_class, 'm_params': m_params, + return {'loss': loss, 'status': status, 'model_class': model_class, + 'm_params': m_params, 'p_params': p_params} trials = Trials() search_space = {'prep': prep_space, 'estimator': model_space} - _ = fmin(objective_func, search_space, algo=self.config.search_algo, trials=trials, - max_evals=self.config.search_iter, rstate=np.random.RandomState(self.config.random_state)) + _ = fmin(objective_func, search_space, algo=self.config.search_algo, + trials=trials, + max_evals=self.config.search_iter, + rstate=np.random.RandomState(self.config.random_state)) return trials def get_best_pipeline(self, trials): @@ -246,7 +281,8 @@ def get_best_pipeline(self, trials): best_pipeline = self.setup_ensemble(trials) else: opt = trials.best_trial['result'] - best_pipeline = AutoPipe(opt['model_class'], opt['m_params'], opt['p_params'], self.config) + best_pipeline = AutoPipe(opt['model_class'], opt['m_params'], + opt['p_params'], self.config) if self.config.verbose: print("The best hyperparameter setting found:") print(opt) @@ -288,7 +324,8 @@ def pick_diverse_estimators(self, trial_list): # Arguments trial_list: List of the hyper-parameter search trials. # Returns - List of top hyper-parameter spaces equally selected from each estimator family. + List of top hyper-parameter spaces equally selected from each + estimator family. 
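The search loop above is standard hyperopt usage. A self-contained miniature of the same pattern (an objective returning a loss/status dict, a Trials store, fmin with a fixed random state), using a plain scikit-learn estimator in place of AutoPipe:

import numpy as np
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

x, y = make_classification(n_samples=200, random_state=0)
space = {'n_estimators': hp.choice('n_estimators', [50, 100, 200]),
         'max_depth': hp.choice('max_depth', list(range(3, 10)))}

def objective(params):
    # Cross-validated score on a subsample; the loss is 1 - score, as in search().
    score = cross_val_score(RandomForestClassifier(**params), x, y,
                            scoring='f1_weighted', cv=3).mean()
    return {'loss': 1 - score, 'status': STATUS_OK, 'params': params}

trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=10,
            rstate=np.random.RandomState(1001))  # RandomState as in search() above;
                                                 # newer hyperopt expects np.random.default_rng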
""" groups = collections.defaultdict(list) @@ -299,7 +336,8 @@ def pick_diverse_estimators(self, trial_list): while idx < self.config.num_estimators_ensemble: for grp in groups.values(): if j < len(grp): - est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'], grp[j]['p_params'], self.config) + est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'], + grp[j]['p_params'], self.config) estimator_list.append(est) idx += 1 j += 1 @@ -310,13 +348,15 @@ def setup_ensemble(self, trials): # Arguments trials: List of the hyper-parameter search trials. # Returns - An ensembling estimator to be trained using the base estimators picked from trials. + An ensembling estimator to be trained using the base estimators picked + from trials. """ # Filter the unsuccessful hparam spaces i.e. 'loss' == float('inf') best_trials = [t for t in trials.results if t['loss'] != float('inf')] best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) - self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) + self.config.num_estimators_ensemble = min( + self.config.num_estimators_ensemble, len(best_trials)) if self.config.random_ensemble: np.random.shuffle(best_trials) @@ -326,20 +366,25 @@ def setup_ensemble(self, trials): else: estimator_list = [] for i in range(self.config.num_estimators_ensemble): - est = AutoPipe(best_trials[i]['model_class'], best_trials[i]['m_params'], best_trials[i]['p_params'], + est = AutoPipe(best_trials[i]['model_class'], + best_trials[i]['m_params'], + best_trials[i]['p_params'], self.config) estimator_list.append(est) if self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackedEnsemblingModel(estimator_list, config=self.config) + best_estimator_ = StackedEnsemblingModel(estimator_list, + config=self.config) else: - best_estimator_ = RankedEnsemblingModel(estimator_list, config=self.config) + best_estimator_ = RankedEnsemblingModel(estimator_list, + config=self.config) return best_estimator_ @staticmethod def extract_data_info(raw_x): """ - Extracts the data info automatically based on the type of each feature in raw_x. + Extracts the data info automatically based on the type of each feature in + raw_x. # Arguments raw_x: a numpy.ndarray instance containing the training data. # Returns @@ -365,36 +410,55 @@ class Classifier(AutoKaggle): verbose: Bool. Defines the verbosity of the logging. time_limit: Int. Time budget for performing search and fit pipeline. use_ensembling: Bool. Defines whether to use an ensemble of models - num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + num_estimators_ensemble: Int. Maximum number of estimators to be used + in an ensemble ensemble_strategy: String. Strategy to ensemble models - ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling - random_ensemble: Bool. Whether the ensembling estimators are picked randomly. - diverse_ensemble: Bool. Whether estimators from different families are picked. - ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + ensemble_method: String. Aggregation method if ensemble_strategy is + set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked + randomly. + diverse_ensemble: Bool. Whether estimators from different families are + picked. + ensembling_search_iter: Int. Search iterations for ensembling + hyper-parameter search search_algo: String. Search strategy for hyper-parameter search. search_iter: Int. 
Number of iterations used for hyper-parameter search. cv_folds: Int. Number of Cross Validation folds. - subsample_ratio: Percent of subsample used for for hyper-parameter search. + subsample_ratio: Percent of subsample used for for hyper-parameter + search. data_info: list(String). Lists the datatypes of each feature column. - stack_probabilities: Bool. Whether to use class probabilities in ensembling. + stack_probabilities: Bool. Whether to use class probabilities in + ensembling. upsample_classes: Bool. Whether to upsample less represented classes num_p_hparams: Int. Number of preprocessor search spaces. """ + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, - num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', - search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, - stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + num_estimators_ensemble=50, ensemble_strategy='stacking', + ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, + random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, + ensembling_search_iter=10, search_algo='random', num_p_hparams=10): - super().__init__(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, - num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, - ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, - subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, diverse_ensemble=diverse_ensemble, - stack_probabilities=stack_probabilities, data_info=data_info, - upsample_classes=upsample_classes, ensembling_search_iter=ensembling_search_iter, + super().__init__(path=path, verbose=verbose, time_limit=time_limit, + use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, + ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, + cv_folds=cv_folds, + subsample_ratio=subsample_ratio, + random_ensemble=random_ensemble, + diverse_ensemble=diverse_ensemble, + stack_probabilities=stack_probabilities, + data_info=data_info, + upsample_classes=upsample_classes, + ensembling_search_iter=ensembling_search_iter, search_algo=search_algo, num_p_hparams=num_p_hparams) self.config.objective = 'classification' - self.m_hparams = hp.choice('classifier', [CLASSIFICATION_HPARAM_SPACE[m] for m in - self.config.classification_models]) + self.m_hparams = hp.choice('classifier', + [CLASSIFICATION_HPARAM_SPACE[m] for m in + self.config.classification_models]) self.m_hparams_base = hp.choice('classifier', [CLASSIFICATION_BASE_HPARAM_SPACE[m] for m in self.config.classification_models]) @@ -406,10 +470,12 @@ def get_skf(self, folds): """ if self.config.objective == 'binary': score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = StratifiedKFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) else: score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = StratifiedKFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) return score_metric, skf @@ -422,63 +488,89 @@ class Regressor(AutoKaggle): verbose: Bool. Defines the verbosity of the logging. time_limit: Int. 
Time budget for performing search and fit pipeline. use_ensembling: Bool. Defines whether to use an ensemble of models - num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + num_estimators_ensemble: Int. Maximum number of estimators to be used + in an ensemble ensemble_strategy: String. Strategy to ensemble models - ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling - random_ensemble: Bool. Whether the ensembling estimators are picked randomly. - diverse_ensemble: Bool. Whether estimators from different families are picked. - ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + ensemble_method: String. Aggregation method if ensemble_strategy is + set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked + randomly. + diverse_ensemble: Bool. Whether estimators from different families are + picked. + ensembling_search_iter: Int. Search iterations for ensembling + hyper-parameter search search_algo: String. Search strategy for hyper-parameter search. search_iter: Int. Number of iterations used for hyper-parameter search. cv_folds: Int. Number of Cross Validation folds. - subsample_ratio: Percent of subsample used for for hyper-parameter search. + subsample_ratio: Percent of subsample used for for hyper-parameter + search. data_info: list(String). Lists the datatypes of each feature column. - stack_probabilities: Bool. Whether to use class probabilities in ensembling. + stack_probabilities: Bool. Whether to use class probabilities in + ensembling. upsample_classes: Bool. Whether to upsample less represented classes num_p_hparams: Int. Number of preprocessor search spaces. """ + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, - num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', - search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, - stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + num_estimators_ensemble=50, ensemble_strategy='stacking', + ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, + random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, + ensembling_search_iter=10, search_algo='random', num_p_hparams=10): - super().__init__(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, - num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, - ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, - subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, + super().__init__(path=path, verbose=verbose, time_limit=time_limit, + use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, + ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, + cv_folds=cv_folds, + subsample_ratio=subsample_ratio, + random_ensemble=random_ensemble, diverse_ensemble=diverse_ensemble, - stack_probabilities=stack_probabilities, data_info=data_info, - upsample_classes=upsample_classes, ensembling_search_iter=ensembling_search_iter, + stack_probabilities=stack_probabilities, + data_info=data_info, + upsample_classes=upsample_classes, + ensembling_search_iter=ensembling_search_iter, search_algo=search_algo, num_p_hparams=num_p_hparams) self.config.objective = 'regression' - 
self.m_hparams = hp.choice('regressor', [REGRESSION_HPARAM_SPACE[m] for m in self.config.regression_models]) + self.m_hparams = hp.choice('regressor', [REGRESSION_HPARAM_SPACE[m] for m in + self.config.regression_models]) self.m_hparams_base = hp.choice('regressor', - [REGRESSION_BASE_HPARAM_SPACE[m] for m in self.config.classification_models]) + [REGRESSION_BASE_HPARAM_SPACE[m] for m in + self.config.classification_models]) self.p_hparams_base = REGRESSION_PREP_HPARAM_SPACE def get_skf(self, folds): """ See the base class. """ - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) class AutoPipe(BaseEstimator): """ Implements a machine learning pipeline. - Implements a machine learning pipeline with preprocessor and estimator. A user can call fit(), and predict() - methods on it. It is used as a search unit in AutoKaggle's hyeper-parameter search. + Implements a machine learning pipeline with preprocessor and estimator. A + user can call fit(), and predict() methods on it. It is used as a search + unit in AutoKaggle's hyeper-parameter search. # Arguments - config: Config. Defines the configuration of various components of the pipeline. + config: Config. Defines the configuration of various components of the + pipeline. m_params: Dict. Hyper-parameter search space for estimator. p_params: Dict. Hyper-parameter search space for preprocessor. model_class: Estimator. Class name of the estimator used in the pipeline. - _estimator_type: String. Denotes if the estimator is 'classifier' or 'regressor' - prep: Preprocessor. Instance of the Preprocessor class, which does basic feature preprocessing and feature + _estimator_type: String. Denotes if the estimator is 'classifier' or + 'regressor' + prep: Preprocessor. Instance of the Preprocessor class, which does + basic feature preprocessing and feature engineering - model: Estimator. Instance of the estimator class which learns a machine learning model and predicts on the + model: Estimator. Instance of the estimator class which learns a + machine learning model and predicts on the given data. """ + def __init__(self, model_class, m_params, p_params, config): self.prep = None self.model = None @@ -486,7 +578,8 @@ def __init__(self, model_class, m_params, p_params, config): self.m_params = m_params self.p_params = p_params self.model_class = model_class - self._estimator_type = 'classifier' if is_classifier(model_class) else 'regressor' + self._estimator_type = 'classifier' if is_classifier( + model_class) else 'regressor' def fit(self, x, y): """ Trains the given pipeline. @@ -513,7 +606,8 @@ def predict(self, x): return self.model.predict(x) def predict_proba(self, x): - """ Predict label probabilities on the test data for the given classification task. + """ Predict label probabilities on the test data for the given + classification task. # Arguments x: A numpy.ndarray instance containing the test data. 
# Returns diff --git a/autokaggle/config.py b/autokaggle/config.py index 87a4f30..729c156 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -4,7 +4,8 @@ from hyperopt import hp from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor,\ +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \ + RandomForestRegressor, AdaBoostRegressor, \ ExtraTreesRegressor from sklearn.linear_model import Ridge from lightgbm import LGBMClassifier, LGBMRegressor @@ -15,33 +16,45 @@ class Config: """ Configuration for various autoML components. - Defines the common configuration of different auto ML components. It is shared between AutoKaggle, AutoPipe, - Preprocessor and Ensembling class. + Defines the common configuration of different auto ML components. It is + shared between AutoKaggle, AutoPipe, Preprocessor and Ensembling class. # Arguments path: String. OS path for storing temporary model parameters. verbose: Bool. Defines the verbosity of the logging. time_limit: Int. Time budget for performing search and fit pipeline. use_ensembling: Bool. Defines whether to use an ensemble of models - num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + num_estimators_ensemble: Int. Maximum number of estimators to be used + in an ensemble ensemble_strategy: String. Strategy to ensemble models - ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling - random_ensemble: Bool. Whether the ensembling estimators are picked randomly. - diverse_ensemble: Bool. Whether estimators from different families are picked. - ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + ensemble_method: String. Aggregation method if ensemble_strategy is + set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked + randomly. + diverse_ensemble: Bool. Whether estimators from different families are + picked. + ensembling_search_iter: Int. Search iterations for ensembling + hyper-parameter search search_algo: String. Search strategy for hyper-parameter search. search_iter: Int. Number of iterations used for hyper-parameter search. cv_folds: Int. Number of Cross Validation folds. - subsample_ratio: Percent of subsample used for for hyper-parameter search. + subsample_ratio: Percent of subsample used for for hyper-parameter + search. data_info: list(String). Lists the datatypes of each feature column. - stack_probabilities: Bool. Whether to use class probabilities in ensembling. + stack_probabilities: Bool. Whether to use class probabilities in + ensembling. upsample_classes: Bool. Whether to upsample less represented classes num_p_hparams: Int. Number of preprocessor search spaces. 
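In practice the shared Config object is built by AutoKaggle from its keyword arguments and then read by the preprocessor and ensemblers. A small, hedged illustration (the argument values are arbitrary):

from autokaggle.config import Config

config = Config(verbose=True, cv_folds=5, use_ensembling=True,
                ensemble_strategy='stacking')
config.objective = 'classification'   # set by the Classifier/Regressor subclasses
print(config.random_state)            # 1001; seeds the CV splits and the search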
""" - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, - subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, - data_info=None, upsample_classes=False, ensembling_search_iter=10, search_algo='random', + + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, + num_estimators_ensemble=50, + ensemble_strategy='stacking', ensemble_method='max_voting', + search_iter=500, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, + data_info=None, upsample_classes=False, ensembling_search_iter=10, + search_algo='random', num_p_hparams=10): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() @@ -61,15 +74,18 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.subsample_ratio = subsample_ratio self.resampling_strategy = 'auto' self.random_state = 1001 - self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest', 'adaboost'] + self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest', + 'adaboost'] # self.classification_models = ['knn', 'lgbm', 'random_forest',] - self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost', 'catboost'] + self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', + 'adaboost', 'catboost'] self.diverse_ensemble = diverse_ensemble self.stack_probabilities = stack_probabilities self.data_info = data_info self.upsample_classes = upsample_classes self.ensembling_search_iter = ensembling_search_iter - self.search_algo = hyperopt.rand.suggest if search_algo == 'random' else hyperopt.tpe.suggest + self.search_algo = hyperopt.rand.suggest if search_algo == 'random' else \ + hyperopt.tpe.suggest self.num_p_hparams = num_p_hparams def update(self, options): @@ -81,7 +97,8 @@ def update(self, options): KNN_CLASSIFIER_PARAMS = { 'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]), 'weights': hp.choice('weight_knn', ['uniform', 'distance']), - 'metric': hp.choice('metric_knn', ["euclidean", "manhattan", "chebyshev", "minkowski"]), + 'metric': hp.choice('metric_knn', + ["euclidean", "manhattan", "chebyshev", "minkowski"]), 'p': hp.choice('p_knn', range(1, 3)), } @@ -110,20 +127,24 @@ def update(self, options): 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2), high=np.log(2)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2), + high=np.log(2)), } ADABOOST_CLASSIFIER_PARAMS = { 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)), - 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), + high=np.log(2)), } CATBOOST_CLASSIFIER_PARAMS = { 'iterations': hp.choice('iterations_catboost', [5, 10]), 'depth': hp.choice('depth_catboost', range(4, 11)), - 'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), high=np.log(1)), - 'loss_function': hp.choice('loss_function_catboost', ['Logloss', 'CrossEntropy']), + 
'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), + high=np.log(1)), + 'loss_function': hp.choice('loss_function_catboost', + ['Logloss', 'CrossEntropy']), 'verbose': True, 'leaf_estimation_iterations': 10, 'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3)) @@ -163,13 +184,15 @@ def update(self, options): 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), high=np.log(1)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), + high=np.log(1)), } ADABOOST_REGRESSOR_PARAMS = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)), - 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), + high=np.log(2)), # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)), } @@ -181,7 +204,6 @@ def update(self, options): 'verbose': True } - REGRESSION_HPARAM_SPACE = { 'extratree': { 'model': ExtraTreesRegressor, @@ -202,14 +224,13 @@ def update(self, options): 'adaboost': { 'model': AdaBoostRegressor, 'param': ADABOOST_REGRESSOR_PARAMS - }, + }, 'catboost': { 'model': CatBoostRegressor, 'param': CATBOOST_REGRESSOR_PARAMS } } - CLASSIFICATION_HPARAM_SPACE = { 'knn': { 'model': KNeighborsClassifier, @@ -284,7 +305,7 @@ def update(self, options): 'adaboost': { 'model': AdaBoostRegressor, 'param': {} - }, + }, 'catboost': { 'model': CatBoostRegressor, 'param': {} @@ -292,7 +313,8 @@ def update(self, options): } REGRESSION_PREP_HPARAM_SPACE = { - 'cat_encoding': hp.choice('cat_enc', ['count', 'target+count', 'target+label', 'label']), + 'cat_encoding': hp.choice('cat_enc', + ['count', 'target+count', 'target+label', 'label']), 'scaling': hp.choice('scaling', [True, False]), 'log_transform': hp.choice('log_transform', [True, False]), 'power_transform': hp.choice('power_transform', [True, False]), @@ -300,15 +322,18 @@ def update(self, options): 'binning': hp.choice('binning', [True, False]), 'add_time_offset': hp.choice('add_time_offset', [True, False]), 'add_time_diff': hp.choice('add_time_diff', [True, False]), - # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', 'min', None]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', + # 'min', None]), # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), - 'imputation_strategy': hp.choice('imputation_strategy', ['most_frequent', 'zero']), + 'imputation_strategy': hp.choice('imputation_strategy', + ['most_frequent', 'zero']), 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) } CLASSIFICATION_PREP_HPARAM_SPACE = { - 'cat_encoding': hp.choice('cat_enc', ['target', 'count', 'target+count', 'target+label']), + 'cat_encoding': hp.choice('cat_enc', + ['target', 'count', 'target+count', 'target+label']), 'scaling': hp.choice('scaling', [True, False]), 'log_transform': hp.choice('log_transform', [True, False]), 'power_transform': hp.choice('power_transform', [True, False]), @@ -316,9 +341,11 @@ def update(self, options): 'binning': hp.choice('binning', [True, False]), 'add_time_offset': hp.choice('add_time_offset', [True, False]), 
'add_time_diff': hp.choice('add_time_diff', [True, False]), - # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', 'min', None]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', + # 'min', None]), # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), - 'imputation_strategy': hp.choice('imputation_strategy', ['most_frequent', 'zero']), + 'imputation_strategy': hp.choice('imputation_strategy', + ['most_frequent', 'zero']), 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) } diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index fa394d2..95d0435 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -1,5 +1,6 @@ from sklearn.base import BaseEstimator -from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, \ + read_json from abc import abstractmethod import numpy as np import os @@ -17,26 +18,31 @@ from sklearn.model_selection import RandomizedSearchCV, cross_val_score import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK -from autokaggle.config import REGRESSION_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE,\ +from autokaggle.config import REGRESSION_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, \ + CLASSIFICATION_BASE_HPARAM_SPACE, \ REGRESSION_BASE_HPARAM_SPACE class EnsemblingModel: """ Base class for ensembling estimators. - This class creates an ensembling estimator from a given list of estimators. The user can call fit() and - predict() methods, similar to the scikit-learn estimators. + This class creates an ensembling estimator from a given list of estimators. + The user can call fit() and predict() methods, similar to the scikit-learn + estimators. # Arguments - config: Config. Defines the configuration of various components of the autoML pipeline. - estimator_list: List. List of the estimators, to be used for building an ensemble. + config: Config. Defines the configuration of various components of the + autoML pipeline. + estimator_list: List. List of the estimators, to be used for building an + ensemble. """ + def __init__(self, estimator_list, config): self.config = config self.estimator_list = estimator_list @abstractmethod - def fit(self, X, y): + def fit(self, x, y): """ Trains the ensemble of estimators on the training data. # Arguments X: A numpy array instance containing the training data. @@ -46,13 +52,14 @@ def fit(self, X, y): pass @abstractmethod - def predict(self, X): + def predict(self, x): """ Generate prediction on the test data for the given task. # Arguments X: A numpy array instance containing the test data. # Returns A numpy array for the predictions on the x_test. - This function provides predicts on the input data using the ensemble of estimators. + This function provides predicts on the input data using the ensemble of + estimators. """ pass @@ -60,17 +67,18 @@ def predict(self, X): class RankedEnsemblingModel(EnsemblingModel): """ Implements ensembling using ranking based methods. 
- This class implements randing based ensembling using ensembling methods amongst: ('mean', 'median', 'max' and - 'majority_voting') + This class implements ranking based ensembling using ensembling methods + amongst: ('mean', 'median', 'max' and 'majority_voting') """ - def fit(self, X, y): + + def fit(self, x, y): for est in self.estimator_list: - est.fit(X, y) - - def predict(self, X): - predictions = np.zeros((len(X), len(self.estimator_list))) + est.fit(x, y) + + def predict(self, x): + predictions = np.zeros((len(x), len(self.estimator_list))) for i, est in enumerate(self.estimator_list): - predictions[:, i] = est.predict(X) + predictions[:, i] = est.predict(x) if self.config.ensemble_method == 'median': return np.median(predictions, axis=1) @@ -87,21 +95,25 @@ def predict(self, X): class StackedEnsemblingModel(EnsemblingModel): """ Implements a stacking based ensembling estimator. - This class creates an ensembling estimator using stacking. It trains an Light-GBM model on the predictions of - thh base estimator. + This class creates an ensembling estimator using stacking. It trains a + LightGBM model on the predictions of the base estimators. # Arguments - stacking_estimator: LightGBM estimator. Meta-learning algorithm for the stacking estimator. + stacking_estimator: LightGBM estimator. Meta-learning algorithm for the + stacking estimator. """ + def __init__(self, estimator_list, config): super().__init__(estimator_list, config) self.stacking_estimator = None if self.config.objective == 'regression': - self.hparams = hp.choice('regressor', [REGRESSION_BASE_HPARAM_SPACE['lgbm']]) + self.hparams = hp.choice('regressor', + [REGRESSION_BASE_HPARAM_SPACE['lgbm']]) self.config.stack_probabilities = False else: - self.hparams = hp.choice('classifier', + [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']]) - self.hparams = hp.choice('classifier', [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']]) + self.hparams = hp.choice('classifier', + [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']]) def get_model_predictions(self, X): """ Generate the combined predictions from the list of the estimators.
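# The stacking flow that StackedEnsemblingModel implements, reduced to a minimal
# stand-alone sketch (illustrative names only, not part of this patch). The patch's
# version additionally tunes the LightGBM meta-learner with hyperopt in search() and
# exposes a stack_probabilities option; the core idea is the same:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def stacked_fit_predict(estimator_list, x, y, x_test):
    # Base estimators are fit on a train split only; the held-out split provides
    # unbiased predictions on which the meta-learner is trained.
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=0.2, random_state=0)
    for est in estimator_list:
        est.fit(x_tr, y_tr)
    val_preds = np.column_stack([est.predict(x_val) for est in estimator_list])
    meta = LGBMClassifier(n_estimators=50)
    meta.fit(val_preds, y_val)
    test_preds = np.column_stack([est.predict(x_test) for est in estimator_list])
    return meta.predict(test_preds)

# e.g. stacked_fit_predict([RandomForestClassifier(), LogisticRegression(max_iter=500)],
#                          x_train, y_train, x_test)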
@@ -126,8 +138,8 @@ def get_model_predictions(self, X): predictions[:, i] = est.predict(X) return predictions - def fit(self, X, y): - x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2) + def fit(self, x, y): + x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2) for est in self.estimator_list: est.fit(x_train, y_train) predictions = self.get_model_predictions(x_val) @@ -146,7 +158,8 @@ def search(self, x, y): def objective_func(args): clf = args['model'](**args['param']) try: - eval_score = cross_val_score(clf, x, y, scoring=score_metric, cv=skf).mean() + eval_score = cross_val_score(clf, x, y, scoring=score_metric, + cv=skf).mean() except ValueError: eval_score = 0 if self.config.verbose: @@ -155,8 +168,10 @@ def objective_func(args): return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=self.config.search_algo, trials=trials, - max_evals=self.config.ensembling_search_iter, rstate=np.random.RandomState(self.config.random_state)) + best = fmin(objective_func, self.hparams, algo=self.config.search_algo, + trials=trials, + max_evals=self.config.ensembling_search_iter, + rstate=np.random.RandomState(self.config.random_state)) opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) @@ -165,8 +180,8 @@ def objective_func(args): print(opt) return best_estimator_ - def predict(self, X): - predictions = self.get_model_predictions(X) + def predict(self, x): + predictions = self.get_model_predictions(x) return self.stacking_estimator.predict(predictions) def get_skf(self, folds): @@ -178,13 +193,16 @@ def get_skf(self, folds): """ if self.config.objective == 'binary': score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = StratifiedKFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) elif self.config.objective == 'multiclass': score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = StratifiedKFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) elif self.config.objective == 'regression': score_metric = 'neg_mean_squared_error' - skf = KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = KFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) else: ValueError("Invalid objective") return score_metric, skf diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index d286dd9..f5ba361 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -4,7 +4,8 @@ import itertools from scipy.stats import pearsonr from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, PowerTransformer, \ + KBinsDiscretizer, OneHotEncoder from sklearn.base import TransformerMixin from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer @@ -12,20 +13,25 @@ from abc import abstractmethod import collections from lightgbm import LGBMClassifier, LGBMRegressor + LEVEL_HIGH = 32 class Preprocessor(TransformerMixin): """ Implements basic preprocessing and feature engineering class. - Preprocessor takes care of the basic preprocessing and feature engineering of the input data. Similar to - Scikit-learn transformers, it implements the fit() and transform() methods. 
TO acheive this It applies various - feature primitives in a sequence using scikit-learn pipeline. + Preprocessor takes care of the basic preprocessing and feature engineering of + the input data. Similar to Scikit-learn transformers,it implements the fit() + and transform() methods. TO acheive this It applies various feature + primitives in a sequence using scikit-learn pipeline. # Arguments - config: Config. Defines the configuration of various components of the autoML pipeline. + config: Config. Defines the configuration of various components of the + AutoML pipeline. params: Dict. Hyper-parameter search space for preprocessor. - pipeline: Pipeline. Sci-kit learn pipeline class to apply the feature primitives in sequence + pipeline: Pipeline. Sci-kit learn pipeline class to apply the feature + primitives in sequence """ + def __init__(self, config, params): self.config = config self.params = params @@ -61,8 +67,8 @@ def transform(self, raw_x): raw_x: a numpy array instance containing the training/testing data # Returns A numpy array instance containing the transformed data. - This function provides transforms the input data by applying the transformations using the pre-trained - preprocessor chain. + This function provides transforms the input data by applying the + transformations using the pre-trained preprocessor chain. """ # Get Meta-Feature data = TabularData(raw_x, self.config.data_info, self.config.verbose) @@ -80,19 +86,30 @@ def get_categorical_pipeline(params): choice = params.get('cat_encoding', 'target') cat_pipeline = [] if choice == 'target': - cat_pipeline.append(('target_encoder', TargetEncoder(operation='upd', selected_type='CAT'))) + cat_pipeline.append(('target_encoder', TargetEncoder(operation='upd', + selected_type='CAT') + )) elif choice == 'label': - cat_pipeline.append(('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) + cat_pipeline.append( + ('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) elif choice == 'count': - cat_pipeline.append(('count_encoder', CatCount(operation='upd', selected_type='CAT'))) + cat_pipeline.append( + ('count_encoder', CatCount(operation='upd', selected_type='CAT'))) elif choice == 'target+count': - cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', selected_type='CAT'))) - cat_pipeline.append(('count_encoder', CatCount(operation='upd', selected_type='CAT'))) + cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', + selected_type='CAT') + )) + cat_pipeline.append( + ('count_encoder', CatCount(operation='upd', selected_type='CAT'))) elif choice == 'one_hot': - cat_pipeline.append(('one_hot_encoder', OneHot(operation='upd', selected_type='CAT'))) + cat_pipeline.append( + ('one_hot_encoder', OneHot(operation='upd', selected_type='CAT'))) elif choice == 'target+label': - cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', selected_type='CAT'))) - cat_pipeline.append(('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) + cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', + selected_type='CAT') + )) + cat_pipeline.append( + ('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) else: raise ValueError return cat_pipeline @@ -113,15 +130,21 @@ def get_numerical_pipeline(params): numeric_pipeline = [] if scaling: - numeric_pipeline.append(('scaler', TabScaler(operation='upd', selected_type='NUM'))) + numeric_pipeline.append( + ('scaler', TabScaler(operation='upd', selected_type='NUM'))) if log_transform: 
- numeric_pipeline.append(('log_transform', LogTransform(operation='upd', selected_type='NUM'))) + numeric_pipeline.append(('log_transform', + LogTransform(operation='upd', + selected_type='NUM'))) if power_transform: - numeric_pipeline.append(('boxcox', BoxCox(operation='upd', selected_type='NUM'))) + numeric_pipeline.append( + ('boxcox', BoxCox(operation='upd', selected_type='NUM'))) if pca: - numeric_pipeline.append(('pca', TabPCA(operation='add', selected_type='NUM'))) + numeric_pipeline.append( + ('pca', TabPCA(operation='add', selected_type='NUM'))) if binning: - numeric_pipeline.append(('binning', Binning(operation='add', selected_type='NUM'))) + numeric_pipeline.append( + ('binning', Binning(operation='add', selected_type='NUM'))) return numeric_pipeline def get_filtering_pipeline(self, params): @@ -134,15 +157,19 @@ def get_filtering_pipeline(self, params): pearson_thresh = params.get('pearson_thresh', 0) feat_importance_thresh = params.get('feat_importance_thresh', 0) - filter_pipeline = [('filter', FilterConstant(operation='del', selected_type='ALL'))] + filter_pipeline = [ + ('filter', FilterConstant(operation='del', selected_type='ALL'))] if pearson_thresh > 0: - filter_pipeline.append(('pearson_corr', FeatureFilter(operation='del', selected_type='ALL', - threshold=pearson_thresh))) + filter_pipeline.append( + ('pearson_corr', FeatureFilter(operation='del', selected_type='ALL', + threshold=pearson_thresh))) if feat_importance_thresh > 0: - filter_pipeline.append(('lgbm_feat_selection', FeatureImportance(operation='del', - selected_type='ALL', - threshold=feat_importance_thresh, - task_type=self.config.objective))) + filter_pipeline.append( + ('lgbm_feat_selection', + FeatureImportance(operation='del', + selected_type='ALL', + threshold=feat_importance_thresh, + task_type=self.config.objective))) return filter_pipeline @staticmethod @@ -157,9 +184,11 @@ def get_time_pipeline(params): add_diff = params.get('add_time_diff', False) time_pipeline = [] if add_offset: - time_pipeline.append(('time_offset', TimeOffset(operation='upd', selected_type='TIME'))) + time_pipeline.append( + ('time_offset', TimeOffset(operation='upd', selected_type='TIME'))) if add_diff: - time_pipeline.append(('time_diff', TimeDiff(operation='add', selected_type='TIME'))) + time_pipeline.append( + ('time_diff', TimeDiff(operation='add', selected_type='TIME'))) return time_pipeline @staticmethod @@ -171,7 +200,9 @@ def get_imputation_pipeline(params): List of primitives to be applied (based on the given setting) """ strategy = params.get('imputation_strategy', 'most_frequent') - impute_pipeline = [('imputer', Imputation(operation='upd', selected_type='ALL', strategy=strategy))] + impute_pipeline = [('imputer', + Imputation(operation='upd', selected_type='ALL', + strategy=strategy))] return impute_pipeline @staticmethod @@ -186,22 +217,27 @@ def get_higher_order_pipeline(params): cat_cat_strategy = params.get('cat_cat_strategy', None) pipeline = [] if cat_num_strategy: - pipeline.append(('cat_num_encoder', CatNumEncoder(operation='add', selected_type1='CAT', - selected_type2='NUM', strategy=cat_num_strategy))) + pipeline.append(('cat_num_encoder', + CatNumEncoder(operation='add', selected_type1='CAT', + selected_type2='NUM', + strategy=cat_num_strategy))) if cat_cat_strategy: - pipeline.append(('cat_cat_encoder', CatCatEncoder(operation='add', selected_type1='CAT', - selected_type2='CAT', strategy=cat_cat_strategy))) + pipeline.append(('cat_cat_encoder', + CatCatEncoder(operation='add', selected_type1='CAT', + 
selected_type2='CAT', + strategy=cat_cat_strategy))) return pipeline class TabularData: """ Represents the data and its meta-info. - TabularData includes the training/testing data along with its meta info such as data types, cardinality etc. The - user can update the data and its meta info as well as select the features matching the criteria. + TabularData includes the training/testing data along with its meta info such + as data types, cardinality etc. The user can update the data and its meta + info as well as select the features matching the criteria. # Arguments verbose: Bool. Determines the verbosity of the logging. - data_info: Dict. Dictionary mapping the feature names to their data_types. + data_info: Dict. Dictionary mapping the feature names to their data_types total_samples: Int. Number of samples in the data cat_col: List. List of the categorical features num_col: List. List of the numerical features @@ -209,11 +245,14 @@ class TabularData: n_cat: Int. Number of categorical features n_num: Int. Number of numerical features n_time: Int. Number of time features - cat_cardinality: Dict. Dictionary mapping categorical feature names of their cardinality (no. of unique - values) - generated_features: List. List of the newly added features. (In addition to the pre-existing columns) - num_info: Dict. Dictionary mapping numeircal column to their meta info such as range, std etc. + cat_cardinality: Dict. Dictionary mapping categorical feature names of + their cardinality (no. of unique values) + generated_features: List. List of the newly added features. (In + addition to the pre-existing columns) + num_info: Dict. Dictionary mapping numeircal column to their meta info + such as range, std etc. """ + def __init__(self, raw_x, data_info, verbose=True): self.cat_col = None self.num_col = None @@ -230,11 +269,13 @@ def __init__(self, raw_x, data_info, verbose=True): self.refresh_col_types() # Convert sparse to dense if needed - raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x + raw_x = raw_x.toarray() if type( + raw_x) == scipy.sparse.csr.csr_matrix else raw_x # To pandas Dataframe if type(raw_x) != pd.DataFrame: - raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) + raw_x = pd.DataFrame(raw_x, + columns=[str(i) for i in range(raw_x.shape[1])]) self.X = raw_x # self.update_cat_cardinality() @@ -267,14 +308,17 @@ def rename_cols(self, key): # Arguments key: Identifier for renaming # Returns - Renaming function which takes current column name and outputs a new unique column name. + Renaming function which takes current column name and outputs a new + unique column name. """ + def rename_fn(col_name): col_name = str(col_name) col_name += '_' + key while col_name in self.X.columns: col_name += '_' + key return col_name + return rename_fn def update(self, operation, columns, x_tr, new_type=None, key=''): @@ -287,7 +331,8 @@ def update(self, operation, columns, x_tr, new_type=None, key=''): key: Name key for renaming the new columns # Returns None - This function takes the transformed (or generated) features after applying the primitive and updates the + This function takes the transformed (or generated) features after applying + the primitive and updates the TabularData. """ if operation == 'upd': @@ -314,7 +359,8 @@ def refresh_col_types(self): None # Returns None - This function updates the cat, num and time column lists based on (any) updates in the data_info. 
+ This function updates the cat, num and time column lists based on (any) + updates in the data_info. """ self.cat_col = [k for k, v in self.data_info.items() if v == 'CAT'] self.num_col = [k for k, v in self.data_info.items() if v == 'NUM'] @@ -339,7 +385,8 @@ def update_cat_cardinality(self): def select_columns(self, data_type): """ Returns all the columns matching the input data_type # Arguments - data_type: Required type of the data (either of 'CAT', 'NUM', 'TIME' or 'ALL') + data_type: Required type of the data (either of 'CAT', 'NUM', 'TIME' or + 'ALL') # Returns List of the feature columns matching the input criteria. """ @@ -360,18 +407,26 @@ def select_columns(self, data_type): class Primitive(BaseEstimator, TransformerMixin): """ Base class for the single order data transformation function. - Primitive learns and applies the data transformation on a given set of features. The user can use fit() and - transform() functions to apply these transformations. + Primitive learns and applies the data transformation on a given set of + features. The user can use fit() and transform() functions to apply these + transformations. # Arguments options: Dict. Special arguments specific to the given primitive. - selected_type: 'String'. Specifies the type of features the transformation is supposed to be applied to. - operation: 'String'. Specifies the type of operation from 'add', 'update' or 'delete' - name_key : 'String'. Signature key to rename the column after applying the primitive. - selected: 'List'. List of the selected features, on which the transformation will be applied - drop_columns: 'List'. List of the features which would be dropped after applying the transformation. - supported_ops: Tuple. Specifies the allowed list of operations for this primitive. + selected_type: 'String'. Specifies the type of features the + transformation is supposed to be applied to. + operation: 'String'. Specifies the type of operation from 'add', 'update' + or 'delete' + name_key : 'String'. Signature key to rename the column after applying + the primitive. + selected: 'List'. List of the selected features, on which the + transformation will be applied + drop_columns: 'List'. List of the features which would be dropped after + applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this + primitive. """ + def __init__(self, operation='upd', selected_type=None, **kwargs): self.options = None self.selected = None @@ -383,7 +438,8 @@ def __init__(self, operation='upd', selected_type=None, **kwargs): self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): - """ Initialize the primitive specific variables (which are not defined in the base class) + """ Initialize the primitive specific variables (which are not defined in the + base class) # Arguments kwargs: Dictionary containing primitive specific variables # Returns @@ -392,7 +448,8 @@ def init_vars(self, **kwargs): self.options = kwargs def fit(self, data, y=None): - """ A wrapper function to train the given primitive on the input training data. + """ A wrapper function to train the given primitive on the input training + data. # Arguments data: A TabularData instance of training data. y: A numpy array of the target values. 
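# To make the fit()/transform() wrapper pattern described above concrete, a
# hypothetical Primitive subclass (illustrative only, not added by this patch) that
# clips the selected numerical columns to learned quantile bounds would look like:
class QuantileClip(Primitive):
    bounds = None
    supported_ops = ('upd',)

    def _fit(self, data, y=None):
        # Learn per-column clip bounds on the selected NUM columns.
        self.bounds = {col: (data.X[col].quantile(0.01), data.X[col].quantile(0.99))
                       for col in self.selected}
        return self

    def _transform(self, data, y=None):
        x_tr = pd.DataFrame()
        for col in self.selected:
            lower, upper = self.bounds[col]
            x_tr[col] = data.X[col].clip(lower, upper)
        data.update(self.operation, self.selected, x_tr, new_type='NUM',
                    key=self.name_key)
        return data

# It would be registered like the existing primitives, e.g. as a pipeline step
# ('quantile_clip', QuantileClip(operation='upd', selected_type='NUM')).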
@@ -401,14 +458,17 @@ def fit(self, data, y=None): """ self.selected = data.select_columns(self.selected_type) if self.operation not in self.supported_ops: - print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) + print("Operation {} not supported for {}".format(self.operation, + self.__class__.__name__) + ) self.selected = None if not self.selected: return self return self._fit(data, y) def transform(self, data, y=None): - """ A wrapper function to generate transformation on the input data based on pre-trained primitive. + """ A wrapper function to generate transformation on the input data based on + pre-trained primitive. # Arguments data: Input training/testing data in TabularData form. y: A numpy array of the target values. @@ -421,7 +481,8 @@ def transform(self, data, y=None): @abstractmethod def _fit(self, data, y=None): - """ Contains the actual implementation of training the primitive (implemented in the child class) + """ Contains the actual implementation of training the primitive (implemented + in the child class) # Arguments data: A TabularData instance of training data. y: A numpy array of the target values. @@ -432,8 +493,8 @@ def _fit(self, data, y=None): @abstractmethod def _transform(self, data, y=None): - """ Contains the actual implementation of transforming the data using primitive. (implemented in the child - class) + """ Contains the actual implementation of transforming the data using + primitive. (implemented in the child class) # Arguments data: Input training/testing data in TabularData form. y: A numpy array of the target values. @@ -446,25 +507,34 @@ def _transform(self, data, y=None): class PrimitiveHigherOrder: """ Base class for the cross-order data transformation function. - PrimitiveHigherOrder learns and applies the data transformation across two sets of features. The user can use - fit() and transform() functions to apply these transformations. + PrimitiveHigherOrder learns and applies the data transformation across two + sets of features. The user can use fit() and transform() functions to + apply these transformations. # Arguments options: Dict. Special arguments specific to the given primitive. - selected_type1: 'String'. Specifies the first type of features the transformation is supposed to be applied - to. - selected_type2: 'String'. Specifies the second type of features the transformation is supposed to be applied - to. - operation: 'String'. Specifies the type of operation from 'add', 'update' or 'delete' - name_key : 'String'. Signature key to rename the column after applying the primitive. - selected_1: 'List'. List of the selected features in the first set, on which the transformation will be + selected_type1: 'String'. Specifies the first type of features the + transformation is supposed to be applied to. + selected_type2: 'String'. Specifies the second type of features the + transformation is supposed to be applied to. + operation: 'String'. Specifies the type of operation from 'add', 'update' + or 'delete' + name_key : 'String'. Signature key to rename the column after applying + the primitive. + selected_1: 'List'. List of the selected features in the first set, on + which the transformation will be applied - selected_2: 'List'. List of the selected features in the second set, on which the transformation will be + selected_2: 'List'. List of the selected features in the second set, on + which the transformation will be applied - drop_columns: 'List'. 
List of the features which would be dropped after applying the transformation. - supported_ops: Tuple. Specifies the allowed list of operations for this primitive. + drop_columns: 'List'. List of the features which would be dropped after + applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this + primitive. """ - def __init__(self, operation='upd', selected_type1=None, selected_type2=None, **kwargs): + + def __init__(self, operation='upd', selected_type1=None, selected_type2=None, + **kwargs): self.options = None self.selected_1 = None self.selected_2 = None @@ -477,7 +547,8 @@ def __init__(self, operation='upd', selected_type1=None, selected_type2=None, ** self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): - """ Initialize the primitive specific variables (which are not defined in the base class) + """ Initialize the primitive specific variables (which are not defined in the + base class) # Arguments kwargs: Dictionary containing primitive specific variables # Returns @@ -486,7 +557,8 @@ def init_vars(self, **kwargs): self.options = kwargs def fit(self, data, y=None): - """ A wrapper function to train the given primitive on the input training data. + """ A wrapper function to train the given primitive on the input training + data. # Arguments data: A TabularData instance of training data. y: A numpy array of the target values. @@ -497,7 +569,9 @@ def fit(self, data, y=None): self.selected_2 = data.select_columns(self.selected_type2) if self.operation not in self.supported_ops: - print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) + print("Operation {} not supported for {}".format(self.operation, + self.__class__.__name__) + ) self.selected_1 = None self.selected_2 = None if not self.selected_1 or not self.selected_2: @@ -505,7 +579,8 @@ def fit(self, data, y=None): return self._fit(data, y) def transform(self, data, y=None): - """ A wrapper function to generate transformation on the input data based on pre-trained primitive. + """ A wrapper function to generate transformation on the input data based on + pre-trained primitive. # Arguments data: Input training/testing data in TabularData form. y: A numpy array of the target values. @@ -518,7 +593,8 @@ def transform(self, data, y=None): @abstractmethod def _fit(self, data, y=None): - """ Contains the actual implementation of training the primitive (implemented in the child class) + """ Contains the actual implementation of training the primitive (implemented + in the child class) # Arguments data: A TabularData instance of training data. y: A numpy array of the target values. @@ -529,8 +605,8 @@ def _fit(self, data, y=None): @abstractmethod def _transform(self, data, y=None): - """ Contains the actual implementation of transforming the data using primitive. (implemented in the child - class) + """ Contains the actual implementation of transforming the data using + primitive. (implemented in the child class) # Arguments data: Input training/testing data in TabularData form. y: A numpy array of the target values. @@ -543,7 +619,8 @@ def _transform(self, data, y=None): class TabScaler(Primitive): """ Standard Scaler primitive. - TabScaler scales the selected numerical features to have 0 mean and unit variance. + TabScaler scales the selected numerical features to have 0 mean and unit + variance. # Arguments scaler: StandardScaler. 
Instance of scikit-learn StandardScaler object @@ -558,17 +635,20 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.scaler.transform(data.X[self.selected]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class BoxCox(Primitive): """ Power Transform primitive. - The class applies BoxCox power transformation to make the selected features have normal distribution. + The class applies BoxCox power transformation to make the selected features + have normal distribution. # Arguments - transformer: PowerTransformer. Instance of scikit-learn PowerTransformer object + transformer: PowerTransformer. Instance of scikit-learn PowerTransformer + object """ transformer = None supported_ops = ('add', 'upd') @@ -580,20 +660,25 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.transformer.transform(data.X[self.selected]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class Binning(Primitive): """ Numerical binning primitive. - The class applies divides the given numeric column in the list of buckets, based on the range of their values. + The class applies divides the given numeric column in the list of buckets, + based on the range of their values. # Arguments - binner: KBinsDiscretizer. Instance of scikit-learn KBinsDiscretizer object - strategy: String. Strategy used to define width of the bins. Possible options are: (‘uniform’, ‘quantile’, + binner: KBinsDiscretizer. Instance of scikit-learn KBinsDiscretizer + object + strategy: String. Strategy used to define width of the bins. Possible + options are: (‘uniform’, ‘quantile’, ‘kmeans’) - encoding: String. Method used to encode the transformed result. Possible options are: (‘onehot’, + encoding: String. Method used to encode the transformed result. Possible + options are: (‘onehot’, ‘onehot-dense’, ‘ordinal’) """ binner = None @@ -612,14 +697,16 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame(self.binner.transform(data.X[self.selected])) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class OneHot(Primitive): """ One Hot Encoder for categorical features. - The class applies one hot encoding to categorical features, using the sklearn implementation. + The class applies one hot encoding to categorical features, using the + sklearn implementation. # Arguments ohe: OneHotEncoder. 
Instance of scikit-learn OneHotEncoder object @@ -635,9 +722,11 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame(self.ohe.transform(data.X[self.selected])) if self.operation == 'add': - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) elif self.operation == 'upd': - data.update('add', self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update('add', self.selected, x_tr, new_type='NUM', + key=self.name_key) data.update('del', self.selected, None, None, key=self.name_key) return data @@ -645,12 +734,14 @@ def _transform(self, data, y=None): class LabelEncode(Primitive): """ Label Encoder for categorical features. - The class applies Label Encoding to categorical features, By mapping each category to a numerical value. + The class applies Label Encoding to categorical features, By mapping each + category to a numerical value. # Arguments - cat_to_int_label: Dict. Mapping from categories to their assigned integer value - unknown_key_dict: Dict. Mapping for each categorical feature column to the integer value to replace the - previously unseen categories + cat_to_int_label: Dict. Mapping from categories to their assigned integer + value + unknown_key_dict: Dict. Mapping for each categorical feature column to + the integer value to replace the previously unseen categories """ cat_to_int_label = None unknown_key_dict = None @@ -660,26 +751,33 @@ def _fit(self, data, y=None): self.cat_to_int_label = {} self.unknown_key_dict = {} for col in self.selected: - self.cat_to_int_label[col] = {key: idx for idx, key in enumerate(set(data.X[col]))} + self.cat_to_int_label[col] = {key: idx for idx, key in + enumerate(set(data.X[col]))} self.unknown_key_dict[col] = len(self.cat_to_int_label[col]) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].apply(lambda key: self.cat_to_int_label[col].get(key, self.unknown_key_dict[col])) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + x_tr[col] = data.X[col].apply( + lambda key: self.cat_to_int_label[col].get(key, + self.unknown_key_dict[ + col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class TargetEncoder(Primitive): """ Target Encoder for categorical features. - The class applies target encoding to categorical features, By learning the mapping of category to numeric value + The class applies target encoding to categorical features, By learning + the mapping of category to numeric value based on some aggregation of the target value. # Arguments - target_encoding_map: Dict. Mapping from categories to their assigned numeric value + target_encoding_map: Dict. 
Mapping from categories to their assigned + numeric value """ target_encoding_map = None supported_ops = ('add', 'upd') @@ -712,28 +810,34 @@ def _fit(self, data, y=None): self.target_encoding_map = {} X['target'] = y for col in self.selected: - self.target_encoding_map[col] = self.calc_smooth_mean(X, col, 'target', alpha=5) + self.target_encoding_map[col] = self.calc_smooth_mean(X, col, 'target', + alpha=5) X.drop('target', axis=1, inplace=True) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], self.target_encoding_map[col][1]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], + self.target_encoding_map[col][1]) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class CatCatEncoder(PrimitiveHigherOrder): """ Cross column feature generator between categorical and categorical columns. - The class learns a new features based on the values of selected two categorical features. + The class learns a new features based on the values of selected two + categorical features. # Arguments - cat_cat_map: Dict. Mapping from cat-cat combination to the an assigned numeric value - strategy: String. Aggregation strategy to learn the mapping between cat-cat combination to numeric value + cat_cat_map: Dict. Mapping from cat-cat combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-cat combination to numeric value """ - supported_ops = ('add', ) + supported_ops = ('add',) cat_cat_map = None strategy = None @@ -742,7 +846,8 @@ def init_vars(self, strategy='count'): @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): - """ Generate mapping for cat-cat combination to the numerical value based on the given strategy. + """ Generate mapping for cat-cat combination to the numerical value based on + the given strategy. # Arguments col1: First categorical column col2: Second categorical column @@ -755,35 +860,45 @@ def cat_cat_count(df, col1, col2, strategy='count'): elif strategy == 'nunique': mapping = df.groupby([col1])[col2].nunique() else: - mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[col2].nunique() + mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[ + col2].nunique() return mapping def _fit(self, data, y=None): self.cat_cat_map = {} self.selected_1 = list(set(self.selected_1 + self.selected_2)) for col1, col2 in itertools.combinations(self.selected_1, 2): - self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(data.X, col1, col2, self.strategy) + self.cat_cat_map[col1 + '_cross_' + col2] = \ + self.cat_cat_count(data.X, + col1, + col2, + self.strategy) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() for col1, col2 in itertools.combinations(self.selected_1, 2): if col1 + '_cross_' + col2 in self.cat_cat_map: - x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_cat_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) return data class CatNumEncoder(PrimitiveHigherOrder): """ Cross column feature generator between categorical and numerical columns. 
- The class learns a new features based on the values of selected categorical and numerical features. + The class learns a new features based on the values of selected categorical + and numerical features. # Arguments - cat_num_map: Dict. Mapping from cat-num combination to the an assigned numeric value - strategy: String. Aggregation strategy to learn the mapping between cat-num combination to numeric value + cat_num_map: Dict. Mapping from cat-num combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-num combination to numeric value """ - supported_ops = ('add', ) + supported_ops = ('add',) cat_num_map = None strategy = None @@ -792,7 +907,8 @@ def init_vars(self, strategy='mean'): @staticmethod def cat_num_interaction(df, col1, col2, method='mean'): - """ Generate mapping for cat-num combination to the numerical value based on the given strategy. + """ Generate mapping for cat-num combination to the numerical value based on + the given strategy. # Arguments col1: categorical column col2: numerical column @@ -817,7 +933,8 @@ def _fit(self, data, y=None): self.cat_num_map = {} for col1 in self.selected_1: for col2 in self.selected_2: - self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(data.X, col1, col2, self.strategy) + self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction( + data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): @@ -825,21 +942,26 @@ def _transform(self, data, y=None): for col1 in self.selected_1: for col2 in self.selected_2: if col1 + '_cross_' + col2 in self.cat_num_map: - x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_num_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) return data class CatBinEncoder(PrimitiveHigherOrder): """ Cross column feature generator between categorical and binary columns. - The class learns a new features based on the values of selected categorical and binary features. + The class learns a new features based on the values of selected categorical + and binary features. # Arguments - cat_bin_map: Dict. Mapping from cat-bin combination to the an assigned numeric value - strategy: String. Aggregation strategy to learn the mapping between cat-bin combination to numeric value + cat_bin_map: Dict. Mapping from cat-bin combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-bin combination to numeric value """ - supported_ops = ('add', ) + supported_ops = ('add',) cat_bin_map = None strategy = None @@ -848,7 +970,8 @@ def init_vars(self, strategy='percent_true'): @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): - """ Generate mapping for cat-bin combination to the numerical value based on the given strategy. + """ Generate mapping for cat-bin combination to the numerical value based on + the given strategy. 
# Arguments col1: Categorical column col2: Binary column @@ -868,7 +991,8 @@ def _fit(self, data, y=None): self.cat_bin_map = {} for col1 in self.selected_1: for col2 in self.selected_2: - self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(data.X, col1, col2, self.strategy) + self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction( + data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): @@ -876,16 +1000,18 @@ def _transform(self, data, y=None): for col1 in self.selected_1: for col2 in self.selected_2: if col1 + '_cross_' + col2 in self.cat_bin_map: - x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_bin_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) return data class FilterConstant(Primitive): """ Filters the constant or very low variance columns. - The class finds the non-changing or very low variance columns and marked them for deletion, so that they are - not used by the machine learning estimator. + The class finds the non-changing or very low variance columns and marked them + for deletion, so that they are not used by the machine learning estimator. """ drop_columns = None supported_ops = ('del',) @@ -896,16 +1022,18 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) return data class TimeDiff(Primitive): """ Adds features based on difference of time values. - This class generates the features as time difference between two selected time columns. + This class generates the features as time difference between two selected + time columns. """ - supported_ops = ('add', ) + supported_ops = ('add',) def _fit(self, data, y=None): return self @@ -914,14 +1042,16 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for a, b in itertools.combinations(self.selected, 2): x_tr[a + '-' + b] = data.X[a] - data.X[b] - data.update(self.operation, self.selected, x_tr, new_type='TIME', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='TIME', + key=self.name_key) return data class TimeOffset(Primitive): """ Updates the time features in terms of difference from the start value. - This class updates the time features such that they are represented as a difference from the start time. + This class updates the time features such that they are represented as a + difference from the start time. # Arguments start_time: Int. Starting time of the selected time feature. @@ -936,19 +1066,21 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() x_tr[self.selected] = data.X[self.selected] - self.start_time - data.update(self.operation, self.selected, x_tr, new_type='TIME', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='TIME', + key=self.name_key) return data class TabPCA(Primitive): """ Generates new features by finding PCA of the selected features. - The class calculates the PCA of the selected features and adds the transformation as new set of features. + The class calculates the PCA of the selected features and adds the + transformation as new set of features. 
# Arguments pca: PCA. Scikit-lean PCA class. """ pca = None - supported_ops = ('add', ) + supported_ops = ('add',) def _fit(self, data, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') @@ -957,8 +1089,10 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_pca = self.pca.transform(data.X[self.selected]) - x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in range(x_pca.shape[1])]) - data.update(self.operation, self.selected, x_pca, new_type='NUM', key=self.name_key) + x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in + range(x_pca.shape[1])]) + data.update(self.operation, self.selected, x_pca, new_type='NUM', + key=self.name_key) return data @@ -967,7 +1101,8 @@ class CatCount(Primitive): Replaces the cargorical variables by their occrance count. # Arguments - count_dict: Dict. Mapping of the categories to their respective frequency count. + count_dict: Dict. Mapping of the categories to their respective frequency + count. unknown_key: Float. Mapping value for previously unseen category. """ count_dict = None @@ -983,16 +1118,18 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col].get(key, self.unknown_key)) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + x_tr[col] = data.X[col].apply( + lambda key: self.count_dict[col].get(key, self.unknown_key)) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class LogTransform(Primitive): """ Calculates the log transformation. - The class Calculates the log transform value of the given numeric feature. The formula is: - sign(x) * log(1 + mod(x)) + The class Calculates the log transform value of the given numeric feature. + The formula is: sign(x) * log(1 + mod(x)) """ name_key = 'log_' supported_ops = ('add', 'upd') @@ -1003,15 +1140,18 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[self.name_key + col] = np.sign(data.X[col]) * np.log(1 + np.abs(data.X[col])) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + x_tr[self.name_key + col] = np.sign(data.X[col]) * np.log( + 1 + np.abs(data.X[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class Imputation(Primitive): """ Filters the features based on Pearson Correlation. - The class removes the features who have low pearson correlation with the target. + The class removes the features who have low pearson correlation with the + target. # Arguments threshold: Float. Threshold for filtering features. 
""" @@ -1027,7 +1167,8 @@ def _fit(self, data, y=None): for col in self.selected: if self.strategy == 'most_frequent': value_counts = data.X[col].value_counts() - self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 + self.impute_dict[ + col] = value_counts.idxmax() if not value_counts.empty else 0 elif self.strategy == 'zero': self.impute_dict[col] = 0 else: @@ -1038,14 +1179,16 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].fillna(self.impute_dict[col]) - data.update(self.operation, self.selected, x_tr, new_type=None, key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type=None, + key=self.name_key) return data class FeatureFilter(Primitive): """ Filters the features based on Pearson Correlation. - The class removes the features who have low pearson correlation with the target. + The class removes the features who have low pearson correlation with the + target. # Arguments threshold: Float. Threshold for filtering features. """ @@ -1068,18 +1211,21 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) return data class FeatureImportance(Primitive): """ Filters the features based on feature importance score. - The class learns a Light GBM estimator for the given data and based on the feature importance scores, filters - the features with importance lower than the threshold. + The class learns a Light GBM estimator for the given data and based on the + feature importance scores, filters the features with importance lower than + the threshold. # Arguments threshold: Float. Threshold for filtering features. - task_type: 'String'. Specifies the task type amongst: ('classification', 'regression') + task_type: 'String'. 
Specifies the task type amongst: ('classification', + 'regression') """ threshold = None task_type = 'classification' @@ -1114,12 +1260,14 @@ def _fit(self, data, y=None): objective='regression') estimator.fit(data.X, y) feature_importance = estimator.feature_importances_ - feature_importance = feature_importance/feature_importance.mean() - self.drop_columns = data.X.columns[np.where(feature_importance < self.threshold)[0]] + feature_importance = feature_importance / feature_importance.mean() + self.drop_columns = data.X.columns[ + np.where(feature_importance < self.threshold)[0]] return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) return data @@ -1141,8 +1289,8 @@ def _transform(self, data, y=None): datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) print(x_train[:4, 20]) prep = Preprocessor() - prep.fit(x_train, y_train, 24*60*60, datainfo) + prep.fit(x_train, y_train, 24 * 60 * 60, datainfo) x_new = prep.transform(x_train) print("-----") - print(x_new[:4, 2]) \ No newline at end of file + print(x_new[:4, 2]) diff --git a/autokaggle/utils.py b/autokaggle/utils.py index e93f7d8..bb3aa69 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -8,7 +8,8 @@ def generate_rand_string(size): chars = string.ascii_uppercase + string.digits return ''.join(random.choice(chars) for _ in range(size)) - + + def ensure_dir(directory): """Create directory if it does not exist.""" if not os.path.exists(directory): @@ -22,20 +23,26 @@ def temp_path_generator(): def rand_temp_folder_generator(): - """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras).""" + """ + Create and return a temporary directory with the path name + '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras). + """ sys_temp = temp_path_generator() path = sys_temp + '_' + generate_rand_string(6) ensure_dir(path) return path + def write_json(data, filename): with open(filename, 'w') as outfile: json.dump(data, outfile) - + + def read_json(filename): with open(filename, 'rb') as infile: return json.load(infile) + def write_csv(filename, line): with open(filename, "a") as f: f.write(", ".join(map(str, line))) diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 0b5b22a..57cd47e 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -1,18 +1,21 @@ import string import random import sys + sys.path.append("../") import numpy as np import pandas as pd import sklearn.model_selection import sklearn.datasets -from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, balanced_accuracy_score,\ -mean_absolute_error, mean_squared_error +from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, \ + balanced_accuracy_score, \ + mean_absolute_error, mean_squared_error # from autosklearn.regression import AutoSklearnRegressor # from autosklearn.classification import AutoSklearnClassifier from autokaggle import * from autokaggle.utils import * import openml + openml.config.apikey = '3c7196c92a274c3b9405a7e26e9f848e' import warnings from abc import abstractmethod @@ -27,22 +30,25 @@ def generate_rand_string(size): class BenchmarkingBase: """ Base class for benchmarking autoML platforms. - This class benchmarks the performance of the given autoML platform. 
The user can call evaluate() method to - evaluate the performance on a single task or run_automation() for the list of the tasks. The tasks are OpenML + This class benchmarks the performance of the given autoML platform. The + user can call evaluate() method to evaluate the performance on a single + task or run_automation() for the list of the tasks. The tasks are OpenML tasks, which specify the dataset and the train/test/validation folds etc. # Arguments results: List. List of the results for each evaluation - sess_name: String. Name of the evaluation session, used for storing the results. + sess_name: String. Name of the evaluation session, used for storing + the results. cls_desc: List. List of the columns to be added in classification result rgs_desc: List. List of the columns to be added in regression result cls_results: DataFrame. Table storing the classification results rgs_results: DataFrame. Table storing the regression results """ results = None - cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", "balanced_accuracy", "F1_score", "AUC"] + cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", + "balanced_accuracy", "F1_score", "AUC"] rgs_desc = ["automl_model", "task_id", "time_limit", "MSE", "MAE", "R2_score"] - + def __init__(self, supress_warnings=True, sess_name=""): if supress_warnings: warnings.filterwarnings('ignore') @@ -50,7 +56,7 @@ def __init__(self, supress_warnings=True, sess_name=""): self.sess_name = generate_rand_string(6) if not sess_name else sess_name self.cls_results = pd.DataFrame(columns=self.cls_desc) self.rgs_results = pd.DataFrame(columns=self.rgs_desc) - + def measure_performance_cls(self, y_true, y_pred, binary=False): """ Calculate the performance of the classification task # Arguments @@ -62,7 +68,8 @@ def measure_performance_cls(self, y_true, y_pred, binary=False): """ accuracy = accuracy_score(y_true, y_pred) ber = balanced_accuracy_score(y_true, y_pred) - f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score(y_true, y_pred, average="weighted") + f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score( + y_true, y_pred, average="weighted") auc = roc_auc_score(y_true, y_pred) if binary else "-" return [accuracy, ber, f1, auc] @@ -78,7 +85,7 @@ def measure_performance_rgs(self, y_true, y_pred): mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred) return [mse, mae, r2] - + def export_results(self): """ Writes the results to a CSV file. # Arguments @@ -87,10 +94,12 @@ def export_results(self): None """ if len(self.cls_results) > 0: - self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) + self.cls_results.to_csv(self.sess_name + "_classification_results.csv", + index=False) if len(self.rgs_results) > 0: - self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", index=False) - + self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", + index=False) + @abstractmethod def evaluate(self, task, time_limit): """ Evaluates the performance of the single task. @@ -101,8 +110,8 @@ def evaluate(self, task, time_limit): List of performance scores of the autoML system on the given task. 
""" pass - - def run_automation(self, task_list, time_limit=10*60): + + def run_automation(self, task_list, time_limit=10 * 60): """ Evaluate the list of the tasks in sequence # Arguments task_list: List of OpenML task ids @@ -116,22 +125,24 @@ def run_automation(self, task_list, time_limit=10*60): self.export_results() except: print("task: {} didnt work".format(task)) - - def time_lapse(self, task_id, time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): + + def time_lapse(self, task_id, + time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): """ Evaluate the task on different time_limits # Arguments task_id: Id of the OpenML task flow time_limits: List of the time_limits to test the performance on # Returns List of combined results of the autoML on each of the time_limit - This function evaluates and compares the performance of the autoML system on different time_limits. It is - helpful to understand the amount of improvement with increase in time budget + This function evaluates and compares the performance of the autoML system + on different time_limits. It is helpful to understand the amount of + improvement with increase in time budget """ tl_results = [] for time_limit in time_limits: tl_results.append(self.evaluate(task_id, time_limit=time_limit)) return tl_results - + def get_dataset_splits(self, task_id): """ Get the train/test splits for the given task # Arguments @@ -142,18 +153,21 @@ def get_dataset_splits(self, task_id): task = openml.tasks.get_task(task_id) train_indices, test_indices = task.get_train_test_split_indices() dataset = task.get_dataset() - X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + X, y, categorical_indicator, attribute_names = dataset.get_data( + target=task.target_name, dataset_format='array') x_train, y_train = X[train_indices], y[train_indices] x_test, y_test = X[test_indices], y[test_indices] return x_train, y_train, x_test, y_test - - + + class BenchmarkingAutoKaggle(BenchmarkingBase): """ Extends the benchmarking class for evaluating AutoKaggle. - This class evaluates the performance of AutoKaggle on the input classification or regression task_list. + This class evaluates the performance of AutoKaggle on the input + classification or regression task_list. """ + def get_data_info(self, dataset, num_cols): """ Get the info of each feature data type # Arguments @@ -176,8 +190,8 @@ def get_data_info(self, dataset, num_cols): else: data_info.append("CAT") return np.array(data_info) - - def evaluate(self, task_id, time_limit=10*60): + + def evaluate(self, task_id, time_limit=10 * 60): """ See base class. 
""" @@ -185,7 +199,8 @@ def evaluate(self, task_id, time_limit=10*60): task = openml.tasks.get_task(task_id) train_indices, test_indices = task.get_train_test_split_indices() dataset = task.get_dataset() - X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + X, y, categorical_indicator, attribute_names = dataset.get_data( + target=task.target_name, dataset_format='array') x_train, y_train = X[train_indices], y[train_indices] x_test, y_test = X[test_indices], y[test_indices] @@ -200,16 +215,17 @@ def evaluate(self, task_id, time_limit=10*60): automl = Regressor() else: print("UNSUPPORTED TASK_TYPE") - assert(0) + assert (0) automl.fit(x_train, y_train, time_limit=time_limit, data_info=data_info) # Evaluate y_hat = automl.predict(x_test) - + if task.task_type == 'Supervised Classification': is_binary = True if len(task.class_labels) <= 2 else False - result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) + result = task_info + self.measure_performance_cls(y_test, y_hat, + binary=is_binary) self.cls_results.loc[len(self.cls_results)] = result elif task.task_type == 'Supervised Regression': result = task_info + self.measure_performance_rgs(y_test, y_hat) @@ -217,21 +233,26 @@ def evaluate(self, task_id, time_limit=10*60): print(result) return result -# -# class BenchmarkingAutoSklearn(BenchmarkingBase): + # + # class BenchmarkingAutoSklearn(BenchmarkingBase): """ Extends the benchmarking class for evaluating AutoSklearn. - This class evaluates the performance of AutoKaggle on the input classification or regression task_list. + This class evaluates the performance of AutoKaggle on the input + classification or regression task_list. """ + + # def get_data_info(self, categorical_indicator): -# return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] +# return ['Categorical' if ci else 'Numerical' for ci in categorical +# indicator] # # def evaluate(self, task_id, time_limit=10*60): # task_info = ["autosklearn", task_id, time_limit] # task = openml.tasks.get_task(task_id) # train_indices, test_indices = task.get_train_test_split_indices() # dataset = task.get_dataset() -# X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') +# X, y, categorical_indicator, attribute_names = dataset.get_data( +# target=task.target_name, dataset_format='array') # # x_train, y_train = X[train_indices], y[train_indices] # x_test, y_test = X[test_indices], y[test_indices] @@ -257,7 +278,8 @@ def evaluate(self, task_id, time_limit=10*60): # y_hat = automl.predict(x_test) # if task.task_type == 'Supervised Classification': # is_binary = True if len(task.class_labels) <= 2 else False -# result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) +# result = task_info + self.measure_performance_cls(y_test, y_hat, +# binary=is_binary) # self.cls_results.loc[len(self.cls_results)] = result # elif task.task_type == 'Supervised Regression': # result = task_info + self.measure_performance_rgs(y_test, y_hat) @@ -286,7 +308,8 @@ def get_task_info(task_ids): task_ids: List of ids of OpenML task flows. # Returns dataset_list: List of the dataset Ids. 
@@ -286,7 +308,8 @@ def get_task_info(task_ids):
         task_ids: List of ids of OpenML task flows.
     # Returns
         dataset_list: List of the dataset Ids.
-        task_types: List of the task type (such as 'binary/multiclass classification' or 'regression'
+        task_types: List of the task type (such as 'binary/multiclass
+        classification' or 'regression'
     """
     task_types = []
     dataset_list = []
@@ -294,8 +317,11 @@ def get_task_info(task_ids):
         task = openml.tasks.get_task(t_id)
         dataset = openml.datasets.get_dataset(task.dataset_id)
         if task.task_type_id == 1:
-            _, y, _, _ = dataset.get_data(target=task.target_name, dataset_format='array')
-            task_type = "Binary Classification" if len(set(y)) <= 2 else "Multiclass classification ({})".format(len(set(y)))
+            _, y, _, _ = dataset.get_data(target=task.target_name,
+                                          dataset_format='array')
+            task_type = "Binary Classification" if len(
+                set(y)) <= 2 else "Multiclass classification ({})".format(
+                len(set(y)))
         else:
             task_type = "Regression"
         task_types.append(task_type)
@@ -309,16 +335,19 @@ def get_dataset_properties(task_ids):
         task_ids: List of ids of OpenML task flows
     # Returns
         Dataframe containing the info of each of the dataset.
-    This function provides the dataset info such as number of instances, number of numeric/nominal/string columns etc.
+    This function provides the dataset info such as number of instances, number of
+    numeric/nominal/string columns etc.
     """
    dataset_list, task_types = get_task_info(task_ids)
-    df = pd.DataFrame(columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", "#Date"])
+    df = pd.DataFrame(
+        columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String",
+                 "#Date"])
     for i, dataset in enumerate(dataset_list):
         df.loc[i] = [
-            dataset.name, 
+            dataset.name,
             dataset.qualities["NumberOfInstances"],
             task_types[i],
-            len(dataset.get_features_by_type('numeric')), 
+            len(dataset.get_features_by_type('numeric')),
             len(dataset.get_features_by_type('nominal')),
             len(dataset.get_features_by_type('string')),
             len(dataset.get_features_by_type('date')),
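A minimal usage sketch for the two helpers above, ahead of the get_performance_table hunk below. The task ids are a subset of the lists in the __main__ block at the end of this file, and the output columns are the ones defined in get_dataset_properties; assumes the openml package can reach the OpenML server:

    # Sketch only: summarise a few OpenML tasks before benchmarking them.
    dataset_list, task_types = get_task_info([3021, 31, 45])
    props = get_dataset_properties([3021, 31, 45])
    print(props.head())
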
""" test = pd.read_csv(filename) perf = pd.DataFrame(columns=["Name", "AutoKaggle", "AutoSklearn", "H2O.ai"]) task_ids = list(set(test["task_id"])) dataset_ids = get_dataset_ids(task_ids) - + test = test.set_index(["task_id", "automl_model"]) test.sort_index(inplace=True) for i, t_id in enumerate(task_ids): try: name = openml.datasets.get_dataset(dataset_ids[i]).name - auto_kaggle = test.loc[(t_id, "autokaggle")][metric].median() if (t_id, "autokaggle") in test.index else np.nan - auto_sklearn = test.loc[(t_id, "autosklearn")][metric].median() if (t_id, "autosklearn") in test.index else np.nan - h2o_ai = test.loc[(t_id, "autosklearn")][metric].median() if (t_id, "autosklearn") in test.index else np.nan + auto_kaggle = test.loc[(t_id, "autokaggle")][metric].median()\ + if (t_id, "autokaggle") in test.index else np.nan + auto_sklearn = test.loc[(t_id, "autosklearn")][metric].median()\ + if (t_id, "autosklearn") in test.index else np.nan + h2o_ai = test.loc[(t_id, "autosklearn")][metric].median()\ + if (t_id, "autosklearn") in test.index else np.nan perf.loc[i] = [name, auto_kaggle, auto_sklearn, h2o_ai] except Exception as e: print(e) @@ -362,12 +395,14 @@ def style_results(res): # Returns Highlighed data-frame """ + def highlight_max(s): """ Highlight the maximum in a Series yellow. """ is_max = s == s.max() return ['background-color: yellow' if v else '' for v in is_max] + res = res.set_index("Name") res.style.apply(highlight_max, axis=1) return res @@ -381,7 +416,8 @@ def get_box_plot(results, task_id, metric): metric: Score metric considered for the box-plot # Returns None - Builds and displays the box plot showing the variance in results for the AutoML performance on the given dataset. + Builds and displays the box plot showing the variance in results for the + AutoML performance on the given dataset. """ auto_sklearn = list(results.loc[(task_id, "autosklearn")][metric]) auto_kaggle = list(results.loc[(task_id, "autokaggle")][metric]) @@ -391,13 +427,16 @@ def get_box_plot(results, task_id, metric): auto_sklearn.append(med_sk) while len(auto_sklearn) > len(auto_kaggle): auto_kaggle.append(med_ak) - temp = pd.DataFrame(data={"Autokaggle":auto_kaggle, "AutoSklearn":auto_sklearn}) + temp = pd.DataFrame( + data={"Autokaggle": auto_kaggle, "AutoSklearn": auto_sklearn}) temp.boxplot() if __name__ == "__main__": - regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] - classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, + regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, + 4850, 4839] + classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, + 14951, 59, 24, 146230, 31, 10101, 9914, 3020, 3524, 3573, 3962] ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time @@ -410,7 +449,8 @@ def get_box_plot(results, task_id, metric): np.random.seed(1001) random.seed(1001) import time + t1 = time.time() ak.evaluate(3021) t2 = time.time() - print(t2-t1) + print(t2 - t1)