From 4655af1c80235384393a0dfa70d4fe3d52982ef7 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Sat, 20 Jul 2019 06:16:13 -0500 Subject: [PATCH 01/31] Class Design + Ensemble --- autokaggle/__init__.py | 2 + autokaggle/ensemblers.py | 139 +++++++++ autokaggle/hparam_space/knn_hp.json | 1 + autokaggle/hparam_space/lgbm_hp.json | 1 + autokaggle/hparam_space/rf_hp.json | 1 + autokaggle/hparam_space/svm_hp.json | 1 + autokaggle/tabular_classifiers.py | 57 ++++ autokaggle/tabular_regressors.py | 34 ++ autokaggle/tabular_supervised.py | 293 +++++++----------- autokaggle/utils.py | 8 + examples/tabular_classification_binary.py | 6 +- examples/tabular_classification_multiclass.py | 4 +- examples/tabular_regression.py | 2 +- 13 files changed, 358 insertions(+), 191 deletions(-) create mode 100644 autokaggle/ensemblers.py create mode 100644 autokaggle/hparam_space/knn_hp.json create mode 100644 autokaggle/hparam_space/lgbm_hp.json create mode 100644 autokaggle/hparam_space/rf_hp.json create mode 100644 autokaggle/hparam_space/svm_hp.json create mode 100644 autokaggle/tabular_classifiers.py create mode 100644 autokaggle/tabular_regressors.py diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index e69de29..14439bc 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -0,0 +1,2 @@ +from .tabular_preprocessor import * +from .tabular_supervised import * \ No newline at end of file diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py new file mode 100644 index 0000000..9081211 --- /dev/null +++ b/autokaggle/ensemblers.py @@ -0,0 +1,139 @@ +from sklearn.base import BaseEstimator +from tabular_preprocessor import TabularPreprocessor +from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from abc import abstractmethod +import numpy as np +import os +import random +import json + +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load + + +class RankedEnsembler: + def __init__(self, ensemble_method='max_voting'): + self.ensemble_method = ensemble_method + + def fit(self, predictions, y_true): + pass + + def predict(self, predictions): + if self.ensemble_method == 'median': + return predictions.apply(np.median, axis=1).values + elif self.ensemble_method == 'mean': + return predictions.apply(np.average, axis=1).values + elif self.ensemble_method == 'max': + return predictions.apply(np.max, axis=1).values + elif self.ensemble_method == 'min': + return predictions.apply(np.min, axis=1).values + elif self.ensemble_method == 'max_voting': + return predictions.apply(mode, axis=1).values + +class EnsembleSelection: + def __init__(self, task_type, metric, mode='fast'): + self.ensemble_size = None + self.mode = mode + self.task_type = task_type + self.metric = metric + + def fit(self, predictions, y_true): + if self.mode == 'fast': + self.fast(predictions, y_true) + else: + self.slow(predictions, y_true) + self._calculate_weights() + + def predict(self, predictions): + return np.average(predictions, axis=0, weights=self.weights_) + + def fast(self, predictions, labels): + """Fast version of Rich Caruana's ensemble selection method.""" + self.ensemble_size = len(predictions) + + ensemble = [] + trajectory = [] + order = [] + + for i in range(self.ensemble_size): + scores = np.zeros((len(predictions))) + s = len(ensemble) + if s == 0: + 
weighted_ensemble_prediction = np.zeros(predictions[0].shape) + else: + # Memory-efficient averaging! + ensemble_prediction = np.zeros(ensemble[0].shape) + for pred in ensemble: + ensemble_prediction += pred + ensemble_prediction /= s + + weighted_ensemble_prediction = (s / float(s + 1)) * \ + ensemble_prediction + fant_ensemble_prediction = np.zeros(weighted_ensemble_prediction.shape) + for j, pred in enumerate(predictions): + fant_ensemble_prediction[:,:] = weighted_ensemble_prediction + \ + (1. / float(s + 1)) * pred + scores[j] = calculate_score( + solution=labels, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + metric=self.metric, + all_scoring_functions=False) + + all_best = np.argwhere(scores == np.nanmin(scores)).flatten() + best = np.random.RandomState.choice(all_best) + ensemble.append(predictions[best]) + trajectory.append(scores[best]) + order.append(best) + + # Handle special case + if len(predictions) == 1: + break + + self.indices_ = order + + def slow(self, predictions, labels): + """Rich Caruana's ensemble selection method.""" + self.ensemble_size = len(predictions) + + ensemble = [] + trajectory = [] + order = [] + + for i in range(self.ensemble_size): + scores = np.zeros([predictions.shape[0]]) + for j, pred in enumerate(predictions): + ensemble.append(pred) + ensemble_prediction = np.mean(np.array(ensemble), axis=0) + scores[j] = calculate_score( + solution=labels, + prediction=ensemble_prediction, + task_type=self.task_type, + metric=self.metric, + all_scoring_functions=False) + ensemble.pop() + best = np.nanargmin(scores) + ensemble.append(predictions[best]) + trajectory.append(scores[best]) + order.append(best) + + # Handle special case + if len(predictions) == 1: + break + + self.indices_ = np.array(order) + + def _calculate_weights(self): + ensemble_members = Counter(self.indices_).most_common() + weights = np.zeros((self.ensemble_size,), dtype=float) + for ensemble_member in ensemble_members: + weight = float(ensemble_member[1]) / self.ensemble_size + weights[ensemble_member[0]] = weight + + if np.sum(weights) < 1: + weights = weights / np.sum(weights) + + self.weights_ = weights \ No newline at end of file diff --git a/autokaggle/hparam_space/knn_hp.json b/autokaggle/hparam_space/knn_hp.json new file mode 100644 index 0000000..609ff31 --- /dev/null +++ b/autokaggle/hparam_space/knn_hp.json @@ -0,0 +1 @@ +{"n_neighbors": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], "weights": ["uniform", "distance"], "algorithm": ["ball_tree", "kd_tree", "brute"], "leaf_size": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], "metric": ["minkowski", "euclidean", "manhattan", "chebyshev"], "p": [1, 2, 3]} \ No newline at end of file diff --git a/autokaggle/hparam_space/lgbm_hp.json b/autokaggle/hparam_space/lgbm_hp.json new file mode 100644 index 0000000..b694b11 --- /dev/null +++ b/autokaggle/hparam_space/lgbm_hp.json @@ -0,0 +1 @@ +{"boosting_type": ["gbdt"], "min_split_gain": [0.1], "max_depth": [5, 8, 10], "min_child_weight": [5, 10, 30, 50, 60, 80, 100], "colsample_bytree": [0.6, 0.7], "learning_rate": [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12], "subsample": [0.8], "num_leaves": [80], "n_estimators": [50, 100, 150, 200]} \ No newline at end of file diff --git a/autokaggle/hparam_space/rf_hp.json b/autokaggle/hparam_space/rf_hp.json new file mode 100644 index 0000000..a56221a --- /dev/null +++ b/autokaggle/hparam_space/rf_hp.json @@ -0,0 +1 @@ +{"criterion": ["entropy", "gini"], "max_features": [0.1, 0.3, 0.5, 0.7, 1.0], "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10], 
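Both ensemblers above are combination strategies over per-model predictions: RankedEnsembler.predict uses DataFrame.apply, so it expects the predictions as a pandas DataFrame with one column per base model, while EnsembleSelection implements Rich Caruana's greedy forward selection but relies on helpers that are not imported in this patch (mode, Counter, calculate_score) and on an instantiated random state. A self-contained sketch of the same greedy idea, assuming plain NumPy prediction vectors and a generic score_fn where lower is better (illustrative only, not the patch's exact implementation):

import numpy as np
from collections import Counter
from sklearn.metrics import mean_squared_error

def greedy_ensemble_selection(predictions, y_true, score_fn, rounds=25):
    """Caruana-style forward selection with replacement; returns model weights."""
    n_models = len(predictions)
    ensemble_sum = np.zeros_like(predictions[0], dtype=float)
    chosen = []
    for _ in range(rounds):
        # Score the ensemble obtained by adding each candidate model once more.
        scores = [score_fn(y_true, (ensemble_sum + p) / (len(chosen) + 1))
                  for p in predictions]
        best = int(np.nanargmin(scores))
        chosen.append(best)
        ensemble_sum += predictions[best]
    counts = Counter(chosen)
    return np.array([counts.get(i, 0) / len(chosen) for i in range(n_models)])

# Illustrative use on synthetic regression predictions.
rng = np.random.RandomState(0)
y_valid = rng.rand(200)
preds = [y_valid + rng.normal(0, s, 200) for s in (0.05, 0.2, 0.5)]
weights = greedy_ensemble_selection(preds, y_valid, mean_squared_error)
blended = np.average(np.stack(preds), axis=0, weights=weights)

The final weighting mirrors _calculate_weights in the patch: each model's weight is the fraction of greedy rounds in which it was selected.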
"min_samples_split": [2, 3, 4], "min_samples_leaf": [1, 2, 3], "n_estimators": [50, 100, 150, 200]} \ No newline at end of file diff --git a/autokaggle/hparam_space/svm_hp.json b/autokaggle/hparam_space/svm_hp.json new file mode 100644 index 0000000..8642e7f --- /dev/null +++ b/autokaggle/hparam_space/svm_hp.json @@ -0,0 +1 @@ +{"C": [0.001, 0.1, 1, 10, 100, 1000, 10000], "gamma": [1e-05, 100000.0], "kernel": ["rbf", "poly", "linear", "sigmoid"], "degree": [2, 3, 4, 5], "max_iter": [50000]} \ No newline at end of file diff --git a/autokaggle/tabular_classifiers.py b/autokaggle/tabular_classifiers.py new file mode 100644 index 0000000..e832e93 --- /dev/null +++ b/autokaggle/tabular_classifiers.py @@ -0,0 +1,57 @@ +from sklearn.base import BaseEstimator +from tabular_preprocessor import TabularPreprocessor +from tabular_supervised import AutoKaggle, TabularEstimator, LGBMMixIn, SklearnMixIn +from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from abc import abstractmethod +import numpy as np +import os +import random +import json + +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load + +class TabularClassifier(TabularEstimator): + """TabularClassifier class. + It is used for tabular data classification with lightgbm classifier. + """ + def __init__(self, path=None, verbose=True, time_limit=None): + super().__init__(path, verbose, time_limit) + self.objective = 'classification' + + def get_skf(self, folds): + if self.objective == 'binary': + score_metric = 'roc_auc' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + else: + score_metric = 'f1_weighted' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + return score_metric, skf + +class SVMClassifier(TabularClassifier, SklearnMixIn): + _default_hyperparams = "svm_hp.json" + + def init_model(self, y): + n_classes = len(set(y)) + self.objective = 'binary' if n_classes == 2 else 'multiclass' + self.estimator = LinearSVC() + +class LgbmClassifier(TabularClassifier, LGBMMixIn): + def init_model(self, y): + n_classes = len(set(y)) + if n_classes == 2: + self.objective = 'binary' + self.estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) + else: + self.objective = 'multiclass' + self.estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + num_class=n_classes, + objective=self.objective) \ No newline at end of file diff --git a/autokaggle/tabular_regressors.py b/autokaggle/tabular_regressors.py new file mode 100644 index 0000000..1b6d358 --- /dev/null +++ b/autokaggle/tabular_regressors.py @@ -0,0 +1,34 @@ +from sklearn.base import BaseEstimator +from tabular_preprocessor import TabularPreprocessor +from tabular_supervised import AutoKaggle, TabularEstimator, LGBMMixIn, SklearnMixIn +from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from abc import abstractmethod +import numpy as np +import os +import random +import json + +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load + + +class TabularRegressor(TabularEstimator): + """TabularRegressor class. 
+ It is used for tabular data regression with lightgbm regressor. + """ + def __init__(self, path=None, verbose=True, time_limit=None): + super().__init__(path, verbose, time_limit) + self.objective = 'regression' + + def get_skf(self, folds): + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) + +class LgbmRegressor(TabularRegressor, LGBMMixIn): + def init_model(self, y): + self.estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) \ No newline at end of file diff --git a/autokaggle/tabular_supervised.py b/autokaggle/tabular_supervised.py index 3f74390..64353eb 100644 --- a/autokaggle/tabular_supervised.py +++ b/autokaggle/tabular_supervised.py @@ -1,72 +1,39 @@ +from sklearn.base import BaseEstimator +from tabular_preprocessor import TabularPreprocessor +from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from abc import abstractmethod - +import numpy as np import os +import random +import json + from lightgbm import LGBMClassifier, LGBMRegressor from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error -import numpy as np -import random - -from autokaggle.tabular_preprocessor import TabularPreprocessor -from autokaggle.utils import rand_temp_folder_generator, ensure_dir +from joblib import dump, load -class TabularSupervised: - def __init__(self, path=None, verbose=True): +class AutoKagggle(BaseEstimator): + def __init__(self, estimator_class=LgbmClassifier, path=None, verbose=True): """ Initialization function for tabular supervised learner. """ self.verbose = verbose self.is_trained = False - self.clf = None self.objective = None self.tabular_preprocessor = None + self.model = None + self.estimator_class = estimator_class self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) if self.verbose: print('Path:', path) - self.save_filename = os.path.join(self.path, 'lgbm.txt') self.time_limit = None - self.lgbm = None - - def search(self, search_space, search_iter, n_estimators, x, y): - if 'n_estimators' in search_space: - del search_space['n_estimators'] - params = { - 'boosting_type': ['gbdt'], - 'min_child_weight': [5], - 'min_split_gain': [1.0], - 'subsample': [0.8], - 'colsample_bytree': [0.6], - 'max_depth': [10], - 'n_estimators': n_estimators, - 'num_leaves': [70], - 'learning_rate': [0.04], - } - params.update(search_space) - if self.verbose: - print(params) - folds = 3 - score_metric, skf = self.get_skf(folds) - - random_search = RandomizedSearchCV(self.lgbm, param_distributions=params, n_iter=search_iter, - scoring=score_metric, - n_jobs=1, cv=skf, verbose=0, random_state=1001) - - random_search.fit(x, y) - self.clf = random_search.best_estimator_ - - return random_search.best_params_ - - @abstractmethod - def get_skf(self, folds): - pass def fit(self, x, y, time_limit=None, data_info=None): """ This function should train the model parameters. - Args: x: A numpy.ndarray instance containing the training data. y: training label vector. 
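The *_hp.json files introduced above (knn_hp, lgbm_hp, rf_hp, svm_hp) are plain dictionaries mapping estimator parameter names to candidate values. A minimal sketch of how such a search space can be fed to scikit-learn's RandomizedSearchCV, which is the same mechanism the estimator classes use; the file path is an assumption and a random forest is used purely for illustration, with the single-dictionary format from this patch:

import json
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Hypothetical path; the package keeps these files under autokaggle/hparam_space/.
with open("autokaggle/hparam_space/rf_hp.json") as f:
    search_space = json.load(f)

x, y = make_classification(n_samples=300, n_features=20, random_state=0)
search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_distributions=search_space,
    n_iter=10,
    scoring="f1_weighted",
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1001),
    random_state=1001,
)
search.fit(x, y)
print(search.best_params_, search.best_score_)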
@@ -84,173 +51,129 @@ def fit(self, x, y, time_limit=None, data_info=None): if time_limit is None: time_limit = 24 * 60 * 60 self.time_limit = time_limit - - self.init_lgbm(y) - - self.tabular_preprocessor = TabularPreprocessor() - + if x.shape[1] == 0: raise ValueError("No feature exist!") - x = self.tabular_preprocessor.fit(x, y, self.time_limit, data_info) - - if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], 0.1) - else: - grid_train_percentage = 1 - grid_n = int(x.shape[0] * grid_train_percentage) - idx = random.sample(list(range(x.shape[0])), grid_n) - - grid_train_x = x[idx, :] - grid_train_y = y[idx] - while x.shape[0] < 60: x = np.concatenate([x, x], axis=0) y = np.concatenate([y, y], axis=0) - - response_rate = sum(y) / len(y) - - if not self.is_trained: - # Two-step cross-validation for hyperparameter selection - if self.verbose: - print('-----------------Search Regularization Params---------------------') - if response_rate < 0.005: - depth_choice = [5] - else: - depth_choice = [8, 10] - - params = { - 'min_split_gain': [0.1], - 'max_depth': depth_choice, - 'min_child_weight': [5, 10, 30, 50, 60, 80, 100], - 'colsample_bytree': [0.6, 0.7], - 'learning_rate': [0.3], - 'subsample': [0.8], - 'num_leaves': [80], - } - - search_iter = 14 - n_estimators_choice = [50] - best_param = self.search( - params, - search_iter, - n_estimators_choice, - grid_train_x, grid_train_y) - - if self.verbose: - print('-----------------Search Learning Rate---------------------') - for key, value in best_param.items(): - best_param[key] = [value] - best_param['learning_rate'] = [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12] - n_estimators_choice = [100, 150, 200] - search_iter = 16 - - self.search( - best_param, - search_iter, - n_estimators_choice, - grid_train_x, grid_train_y) - - if self.verbose: - print('self.clf', self.clf) - self.is_trained = True - - # Fit Model - self.clf.fit(x, y) - - self.clf.booster_.save_model(self.save_filename) + + # Init model and preprocessor + self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) + self.tabular_preprocessor = TabularPreprocessor() + + # Fit Model and preprocessor + x = self.tabular_preprocessor.fit(x, y, self.time_limit, data_info) + self.model.fit(x, y) + self.model.save_model() + self.is_trained = True if self.verbose: print("The whole available data is: ") print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) - print('Feature Importance:') - print(self.clf.feature_importances_) - - @abstractmethod - def init_lgbm(self, y): - pass - def predict(self, x_test): """ This function should provide predictions of labels on (test) data. The function predict eventually casdn return probabilities or continuous values. """ x_test = self.tabular_preprocessor.encode(x_test) - y = self.clf.predict(x_test, ) + y = self.model.predict(x_test, ) if y is None: raise ValueError("Tabular predictor does not exist") return y - @abstractmethod - def evaluate(self, x_test, y_test): - pass - - def final_fit(self, x_train, y_train): - x_train = self.tabular_preprocessor.encode(x_train) - self.clf.fit(x_train, y_train) - - -class TabularRegressor(TabularSupervised): - """TabularRegressor class. - It is used for tabular data regression with lightgbm regressor. 
- """ - - def __init__(self, path=None): - super().__init__(path) - self.objective = 'regression' - - def evaluate(self, x_test, y_test): - y_pred = self.predict(x_test) - return mean_squared_error(y_test, y_pred) - - def init_lgbm(self, y): - self.lgbm = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - - def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - - -class TabularClassifier(TabularSupervised): - """TabularClassifier class. - It is used for tabular data classification with lightgbm classifier. - """ - - def init_lgbm(self, y): - n_classes = len(set(y)) - if n_classes == 2: - self.objective = 'binary' - self.lgbm = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - else: - self.objective = 'multiclass' - self.lgbm = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - num_class=n_classes, - objective=self.objective) - def evaluate(self, x_test, y_test): if self.verbose: - print('objective:', self.objective) + print('objective:', self.model.objective) y_pred = self.predict(x_test) results = None - if self.objective == 'binary': + if self.model.objective == 'binary': results = roc_auc_score(y_test, y_pred) - elif self.objective == 'multiclass': + elif self.model.objective == 'multiclass': results = f1_score(y_test, y_pred, average='weighted') + elif self.model.objective == 'regression': + results = mean_squared_error(y_test, y_pred) return results - def get_skf(self, folds): - if self.lgbm.objective == 'binary': - score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + def final_fit(self, x_train, y_train): + x_train = self.tabular_preprocessor.encode(x_train) + self.model.fit(x_train, y_train) + +class TabularEstimator(BaseEstimator): + def __init__(self, path=None, verbose=True, time_limit=None): + """ + Initialization function for tabular supervised learner. 
+ """ + self.verbose = verbose + self.path = path + self.time_limit = time_limit + self.objective = None + self.hparams = read_json(self._default_hyperparams) + self.clf = None + self.estimator = None + + def fit(self, x, y): + self.init_model(y) + self.search(x, y) + self.clf.fit(x, y) + self.save_model() + + def predict(self, x, y=None): + y = self.clf.predict(x, ) + return y + + def search(self, x, y, search_iter=40, folds=3): + # Set small sample for hyper-param search + if x.shape[0] > 600: + grid_train_percentage = max(600.0 / x.shape[0], 0.1) else: - score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) - return score_metric, skf + grid_train_percentage = 1 + grid_n = int(x.shape[0] * grid_train_percentage) + idx = random.sample(list(range(x.shape[0])), grid_n) + grid_train_x, grid_train_y = x[idx, :], y[idx] + + if self.verbose: print(self.hparams) + score_metric, skf = self.get_skf(folds) + random_search = RandomizedSearchCV(self.estimator, param_distributions=self.hparams, n_iter=search_iter, + scoring=score_metric, + n_jobs=1, cv=skf, verbose=0, random_state=1001) + + random_search.fit(grid_train_x, grid_train_y) + self.clf = random_search.best_estimator_ + + return random_search.best_params_ + + @abstractmethod + def save_model(self): + pass + + @abstractmethod + def init_model(self, y): + pass + + @abstractmethod + def get_skf(self, folds): + pass + + def __repr__(self): + return "Estimator model" +class LGBMMixIn: + _default_hyperparams = "lgbm_hp.json" + + def save_model(self): + self.clf.booster_.save_model(self.save_filename) + + def get_feature_importance(self): + if self.estimator: + print('Feature Importance:') + print(self.clf.feature_importances_) + +class SklearnMixIn: + + def save_model(self): + dump(self.clf, self.save_filename) + + def load_model(self): + self.clf = load(self.save_filename) \ No newline at end of file diff --git a/autokaggle/utils.py b/autokaggle/utils.py index 62b833f..cfb3deb 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -25,3 +25,11 @@ def rand_temp_folder_generator(): path = sys_temp + '_' + random_suffix ensure_dir(path) return path + +def write_json(data, filename): + with open(filename, 'w') as outfile: + json.dump(data, outfile) + +def read_json(filename): + with open(filename, 'rb') as infile: + return json.load(infile) diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index df472a5..cfe0b9f 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -1,5 +1,5 @@ import numpy as np -from autokaggle import TabularClassifier +from autokaggle import AutoKaggle if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,9 +16,9 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularClassifier() + clf = AutoKagggle() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) AUC = clf.evaluate(x_test, y_test) - print(AUC) + print(AUC) \ No newline at end of file diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 7515841..d62eb51 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -16,9 +16,9 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularClassifier() + clf = AutoKagggle() datainfo = np.array(['TIME'] * 
ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) F1_score = clf.evaluate(x_test, y_test) - print(F1_score) + print(F1_score) \ No newline at end of file diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 8ba95a8..7a47cc8 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -16,7 +16,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularRegressor() + clf = AutoKagggle(LgbmRegressor) datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 289805c6cf9b5fda556e9c4b5d77c450180490a1 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 25 Jul 2019 09:49:03 -0500 Subject: [PATCH 02/31] Refactor --- autokaggle/__init__.py | 4 +- .../{tabular_supervised.py => auto_ml.py} | 100 ++---------------- autokaggle/hparam_space/lgbm_hp.json | 2 +- ...abular_preprocessor.py => preprocessor.py} | 27 +++-- autokaggle/tabular_classifiers.py | 57 ---------- autokaggle/tabular_regressors.py | 34 ------ autokaggle/utils.py | 1 + examples/tabular_classification_binary.py | 6 +- examples/tabular_classification_multiclass.py | 6 +- examples/tabular_regression.py | 6 +- 10 files changed, 44 insertions(+), 199 deletions(-) rename autokaggle/{tabular_supervised.py => auto_ml.py} (52%) rename autokaggle/{tabular_preprocessor.py => preprocessor.py} (94%) delete mode 100644 autokaggle/tabular_classifiers.py delete mode 100644 autokaggle/tabular_regressors.py diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index 14439bc..3961f6d 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,2 +1,2 @@ -from .tabular_preprocessor import * -from .tabular_supervised import * \ No newline at end of file +from autokaggle.auto_ml import AutoKaggle +from autokaggle.estimators import * \ No newline at end of file diff --git a/autokaggle/tabular_supervised.py b/autokaggle/auto_ml.py similarity index 52% rename from autokaggle/tabular_supervised.py rename to autokaggle/auto_ml.py index 64353eb..e2936e5 100644 --- a/autokaggle/tabular_supervised.py +++ b/autokaggle/auto_ml.py @@ -1,20 +1,17 @@ from sklearn.base import BaseEstimator -from tabular_preprocessor import TabularPreprocessor -from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from abc import abstractmethod import numpy as np import os import random import json - -from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV -from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error from joblib import dump, load +from autokaggle.preprocessor import TabularPreprocessor +from autokaggle.estimators import * +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json -class AutoKagggle(BaseEstimator): +class AutoKaggle(BaseEstimator): def __init__(self, estimator_class=LgbmClassifier, path=None, verbose=True): """ Initialization function for tabular supervised learner. 
@@ -22,7 +19,7 @@ def __init__(self, estimator_class=LgbmClassifier, path=None, verbose=True): self.verbose = verbose self.is_trained = False self.objective = None - self.tabular_preprocessor = None + self.preprocessor = None self.model = None self.estimator_class = estimator_class self.path = path if path is not None else rand_temp_folder_generator() @@ -61,10 +58,10 @@ def fit(self, x, y, time_limit=None, data_info=None): # Init model and preprocessor self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) - self.tabular_preprocessor = TabularPreprocessor() + self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor - x = self.tabular_preprocessor.fit(x, y, self.time_limit, data_info) + x = self.preprocessor.fit(x, y, self.time_limit, data_info) self.model.fit(x, y) self.model.save_model() self.is_trained = True @@ -78,7 +75,7 @@ def predict(self, x_test): This function should provide predictions of labels on (test) data. The function predict eventually casdn return probabilities or continuous values. """ - x_test = self.tabular_preprocessor.encode(x_test) + x_test = self.preprocessor.encode(x_test) y = self.model.predict(x_test, ) if y is None: raise ValueError("Tabular predictor does not exist") @@ -98,82 +95,5 @@ def evaluate(self, x_test, y_test): return results def final_fit(self, x_train, y_train): - x_train = self.tabular_preprocessor.encode(x_train) - self.model.fit(x_train, y_train) - -class TabularEstimator(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None): - """ - Initialization function for tabular supervised learner. - """ - self.verbose = verbose - self.path = path - self.time_limit = time_limit - self.objective = None - self.hparams = read_json(self._default_hyperparams) - self.clf = None - self.estimator = None - - def fit(self, x, y): - self.init_model(y) - self.search(x, y) - self.clf.fit(x, y) - self.save_model() - - def predict(self, x, y=None): - y = self.clf.predict(x, ) - return y - - def search(self, x, y, search_iter=40, folds=3): - # Set small sample for hyper-param search - if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], 0.1) - else: - grid_train_percentage = 1 - grid_n = int(x.shape[0] * grid_train_percentage) - idx = random.sample(list(range(x.shape[0])), grid_n) - grid_train_x, grid_train_y = x[idx, :], y[idx] - - if self.verbose: print(self.hparams) - score_metric, skf = self.get_skf(folds) - random_search = RandomizedSearchCV(self.estimator, param_distributions=self.hparams, n_iter=search_iter, - scoring=score_metric, - n_jobs=1, cv=skf, verbose=0, random_state=1001) - - random_search.fit(grid_train_x, grid_train_y) - self.clf = random_search.best_estimator_ - - return random_search.best_params_ - - @abstractmethod - def save_model(self): - pass - - @abstractmethod - def init_model(self, y): - pass - - @abstractmethod - def get_skf(self, folds): - pass - - def __repr__(self): - return "Estimator model" -class LGBMMixIn: - _default_hyperparams = "lgbm_hp.json" - - def save_model(self): - self.clf.booster_.save_model(self.save_filename) - - def get_feature_importance(self): - if self.estimator: - print('Feature Importance:') - print(self.clf.feature_importances_) - -class SklearnMixIn: - - def save_model(self): - dump(self.clf, self.save_filename) - - def load_model(self): - self.clf = load(self.save_filename) \ No newline at end of file + x_train = self.preprocessor.encode(x_train) + self.model.fit(x_train, y_train) \ No newline at end of file diff 
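After this refactor, AutoKaggle is the single entry point: it fits a TabularPreprocessor and then delegates model selection to whichever estimator class is injected. A condensed usage sketch on synthetic data, mirroring the example scripts and assuming the estimators module that a later patch in this series adds:

import numpy as np
from autokaggle import AutoKaggle
from autokaggle.estimators import LgbmClassifier, LgbmRegressor

ntime, nnum, ncat = 4, 10, 8
nsample = 1000
x = np.random.random([nsample, ntime + nnum + ncat])
y = np.random.randint(0, 2, nsample)
data_info = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat)

clf = AutoKaggle(LgbmClassifier)            # or AutoKaggle(LgbmRegressor) for regression
clf.fit(x[:800], y[:800], time_limit=60 * 60, data_info=data_info)
print(clf.evaluate(x[800:], y[800:]))       # AUC for the binary objective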
--git a/autokaggle/hparam_space/lgbm_hp.json b/autokaggle/hparam_space/lgbm_hp.json index b694b11..b2f6311 100644 --- a/autokaggle/hparam_space/lgbm_hp.json +++ b/autokaggle/hparam_space/lgbm_hp.json @@ -1 +1 @@ -{"boosting_type": ["gbdt"], "min_split_gain": [0.1], "max_depth": [5, 8, 10], "min_child_weight": [5, 10, 30, 50, 60, 80, 100], "colsample_bytree": [0.6, 0.7], "learning_rate": [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12], "subsample": [0.8], "num_leaves": [80], "n_estimators": [50, 100, 150, 200]} \ No newline at end of file +[{"boosting_type": ["gbdt"], "min_child_weight": [5, 10, 30, 50, 60, 80, 100], "min_split_gain": [0.1], "subsample": [0.8], "colsample_bytree": [0.6, 0.7], "max_depth": [5, 8, 10], "n_estimators": [50], "num_leaves": [80], "learning_rate": [0.3]}, {"learning_rate": [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12], "n_estimators": [100, 150, 200]}] \ No newline at end of file diff --git a/autokaggle/tabular_preprocessor.py b/autokaggle/preprocessor.py similarity index 94% rename from autokaggle/tabular_preprocessor.py rename to autokaggle/preprocessor.py index 1bf6d76..543e99a 100644 --- a/autokaggle/tabular_preprocessor.py +++ b/autokaggle/preprocessor.py @@ -100,7 +100,7 @@ def __init__(self): self.feature_add_cat_cat = 0 self.order_num_cat_pair = {} - self.rest = None + self.selected_cols = None self.budget = None self.data_info = None self.n_time = None @@ -108,10 +108,16 @@ def __init__(self): self.n_cat = None def remove_useless(self, x): - self.rest = np.where(np.max(x, 0) - np.min(x, 0) != 0)[0] - return x[:, self.rest] + """ + Remove the columns whose max_value == min_value + """ + self.selected_cols = np.where(np.max(x, axis=0) - np.min(x, axis=0) != 0)[0] + return x[:, self.selected_cols] def process_time(self, x): + """ + Process the TIME features. Add the difference between consecutive columns as a feature. 
+ """ cols = range(self.n_time) if len(cols) > 10: cols = cols[:10] @@ -122,6 +128,9 @@ def process_time(self, x): return x def extract_data(self, raw_x): + """ + Encodes the catgorical data and returns all numeric values + """ # only get numerical variables ret = np.concatenate([raw_x['TIME'], raw_x['NUM'], raw_x['CAT']], axis=1) n_rows = ret.shape[0] @@ -242,7 +251,7 @@ def fit(self, raw_x, y, time_limit, data_info): # Get Meta-Feature self.budget = time_limit self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - print('QQ: {}'.format(self.data_info)) + print('DATA_INFO: {}'.format(self.data_info)) self.n_time = sum(self.data_info == 'TIME') self.n_num = sum(self.data_info == 'NUM') @@ -250,9 +259,9 @@ def fit(self, raw_x, y, time_limit, data_info): self.total_samples = raw_x.shape[0] - print('QQ1: {}'.format(self.n_time)) - print('QQ2: {}'.format(self.n_num)) - print('QQ3: {}'.format(self.n_cat)) + print('#TIME features: {}'.format(self.n_time)) + print('#NUM features: {}'.format(self.n_num)) + print('#CAT features: {}'.format(self.n_cat)) raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], 'NUM': raw_x[:, self.data_info == 'NUM'], 'CAT': raw_x[:, self.data_info == 'CAT']} @@ -321,8 +330,8 @@ def encode(self, raw_x, time_limit=None): x = self.cat_to_num(x) x = self.process_time(x) - if self.rest is not None: - x = x[:, self.rest] + if self.selected_cols is not None: + x = x[:, self.selected_cols] return x @staticmethod diff --git a/autokaggle/tabular_classifiers.py b/autokaggle/tabular_classifiers.py deleted file mode 100644 index e832e93..0000000 --- a/autokaggle/tabular_classifiers.py +++ /dev/null @@ -1,57 +0,0 @@ -from sklearn.base import BaseEstimator -from tabular_preprocessor import TabularPreprocessor -from tabular_supervised import AutoKaggle, TabularEstimator, LGBMMixIn, SklearnMixIn -from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json -from abc import abstractmethod -import numpy as np -import os -import random -import json - -from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV -from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error -from joblib import dump, load - -class TabularClassifier(TabularEstimator): - """TabularClassifier class. - It is used for tabular data classification with lightgbm classifier. 
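remove_useless drops the columns whose values never vary (max equals min) and stores the surviving indices in selected_cols so that encode can re-apply exactly the same mask at prediction time. The mask in isolation, on a small array:

import numpy as np

x = np.array([[1., 5., 0.],
              [2., 5., 0.],
              [3., 5., 0.]])
selected_cols = np.where(np.max(x, axis=0) - np.min(x, axis=0) != 0)[0]
print(selected_cols)        # [0] -- only the first column varies, the constant ones are dropped
print(x[:, selected_cols])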
- """ - def __init__(self, path=None, verbose=True, time_limit=None): - super().__init__(path, verbose, time_limit) - self.objective = 'classification' - - def get_skf(self, folds): - if self.objective == 'binary': - score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) - else: - score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) - return score_metric, skf - -class SVMClassifier(TabularClassifier, SklearnMixIn): - _default_hyperparams = "svm_hp.json" - - def init_model(self, y): - n_classes = len(set(y)) - self.objective = 'binary' if n_classes == 2 else 'multiclass' - self.estimator = LinearSVC() - -class LgbmClassifier(TabularClassifier, LGBMMixIn): - def init_model(self, y): - n_classes = len(set(y)) - if n_classes == 2: - self.objective = 'binary' - self.estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - else: - self.objective = 'multiclass' - self.estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - num_class=n_classes, - objective=self.objective) \ No newline at end of file diff --git a/autokaggle/tabular_regressors.py b/autokaggle/tabular_regressors.py deleted file mode 100644 index 1b6d358..0000000 --- a/autokaggle/tabular_regressors.py +++ /dev/null @@ -1,34 +0,0 @@ -from sklearn.base import BaseEstimator -from tabular_preprocessor import TabularPreprocessor -from tabular_supervised import AutoKaggle, TabularEstimator, LGBMMixIn, SklearnMixIn -from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json -from abc import abstractmethod -import numpy as np -import os -import random -import json - -from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV -from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error -from joblib import dump, load - - -class TabularRegressor(TabularEstimator): - """TabularRegressor class. - It is used for tabular data regression with lightgbm regressor. 
- """ - def __init__(self, path=None, verbose=True, time_limit=None): - super().__init__(path, verbose, time_limit) - self.objective = 'regression' - - def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - -class LgbmRegressor(TabularRegressor, LGBMMixIn): - def init_model(self, y): - self.estimator = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) \ No newline at end of file diff --git a/autokaggle/utils.py b/autokaggle/utils.py index cfb3deb..31479c6 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -2,6 +2,7 @@ import tempfile import string import random +import json def ensure_dir(directory): diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index cfe0b9f..b4fda77 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -1,5 +1,7 @@ import numpy as np -from autokaggle import AutoKaggle +import sys +sys.path.append("../autokaggle/") +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKagggle() + clf = AutoKaggle() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index d62eb51..19284cd 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -1,5 +1,7 @@ import numpy as np -from autokaggle import TabularClassifier +import sys +sys.path.append("../autokaggle/") +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKagggle() + clf = AutoKaggle() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 7a47cc8..8c1ed50 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -1,5 +1,7 @@ import numpy as np -from autokaggle import TabularRegressor +import sys +sys.path.append("../autokaggle/") +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKagggle(LgbmRegressor) + clf = AutoKaggle(LgbmRegressor) datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 23ddedf46c299294dcab5dd7832b01f80bbed1d0 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 25 Jul 2019 13:23:54 -0500 Subject: [PATCH 03/31] Add Benchmarking script --- autokaggle/utils.py | 14 +++- examples/benchmarking.py | 166 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 4 deletions(-) create mode 100644 examples/benchmarking.py diff --git a/autokaggle/utils.py b/autokaggle/utils.py index 31479c6..e93f7d8 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -5,6 +5,10 @@ import json +def generate_rand_string(size): + chars = string.ascii_uppercase + string.digits + return ''.join(random.choice(chars) for _ in range(size)) + def ensure_dir(directory): """Create 
directory if it does not exist.""" if not os.path.exists(directory): @@ -19,11 +23,8 @@ def temp_path_generator(): def rand_temp_folder_generator(): """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras).""" - chars = string.ascii_uppercase + string.digits - size = 6 - random_suffix = ''.join(random.choice(chars) for _ in range(size)) sys_temp = temp_path_generator() - path = sys_temp + '_' + random_suffix + path = sys_temp + '_' + generate_rand_string(6) ensure_dir(path) return path @@ -34,3 +35,8 @@ def write_json(data, filename): def read_json(filename): with open(filename, 'rb') as infile: return json.load(infile) + +def write_csv(filename, line): + with open(filename, "a") as f: + f.write(", ".join(map(str, line))) + f.write("\n") diff --git a/examples/benchmarking.py b/examples/benchmarking.py new file mode 100644 index 0000000..860b48b --- /dev/null +++ b/examples/benchmarking.py @@ -0,0 +1,166 @@ +import numpy as np +import pandas as pd +import sklearn.model_selection +import sklearn.datasets +from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, balanced_accuracy_score,\ +mean_absolute_error, mean_squared_error +from autosklearn.regression import AutoSklearnRegressor +from autosklearn.classification import AutoSklearnClassifier +from autokaggle import * +from autokaggle.utils import * +import openml +openml.config.apikey = '3c7196c92a274c3b9405a7e26e9f848e' +import warnings +from abc import abstractmethod + +def generate_rand_string(size): + chars = string.ascii_uppercase + string.digits + return ''.join(random.choice(chars) for _ in range(size)) + +class BenchmarkingBase: + results = None + cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", "balanced_accuracy", "F1_score", "AUC"] + rgs_desc = ["automl_model", "task_id", "time_limit", "MSE", "MAE", "R2_score"] + + def __init__(self, supress_warnings=True, sess_name=""): + if supress_warnings: + warnings.filterwarnings('ignore') + self.results = [] + if not sess_name: + sess_name = generate_rand_string(6) + self.cls_results = pd.DataFrame(columns=self.cls_desc) + self.rgs_results = pd.DataFrame(columns=self.rgs_desc) + + def measure_performance_cls(self, y_true, y_pred, binary=False): + accuracy = accuracy_score(y_true, y_pred) + ber = balanced_accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score(y_true, y_pred, average="weighted") + auc = roc_auc_score(y_true, y_pred) if binary else "-" + return [accuracy, ber, f1, auc] + + def measure_performance_rgs(self, y_true, y_pred): + mse = mean_squared_error(y_true, y_pred) + mae = mean_absolute_error(y_true, y_pred) + r2 = r2_score(y_true, y_pred) + return [mse, mae, r2] + + def export_results(self): + self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) + self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", index=False) + + @abstractmethod + def evaluate(self, task, time_limit): + pass + + def run_automation(self, task_list, time_limit=10*60): + for task in task_list: + try: + self.evaluate(task, time_limit=time_limit) + except: + print("task: {} didnt work".format(task)) + + def time_lapse(self, task_id, time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): + tl_results = [] + for time_limit in time_limits: + tl_results.append(self.evaluate(task_id, time_limit=time_limit)) + return tl_results + + +class BenchmarkingAutoKaggle(BenchmarkingBase): + def get_data_info(self, dataset, 
num_cols): + nominal_feat = dataset.get_features_by_type('nominal') + numerical_feat = dataset.get_features_by_type('numeric') + string_feat = dataset.get_features_by_type('string') + date_feat = dataset.get_features_by_type('date') + + data_info = [] + for i in range(num_cols): + if i in date_feat: + data_info.append("TIM") + elif i in numerical_feat: + data_info.append("NUM") + else: + data_info.append("CAT") + return np.array(data_info) + + def evaluate(self, task_id, time_limit=10*60): + task_info = ["autokaggle", task_id, time_limit] + task = openml.tasks.get_task(task_id) + train_indices, test_indices = task.get_train_test_split_indices() + dataset = task.get_dataset() + X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + + x_train, y_train = X[train_indices], y[train_indices] + x_test, y_test = X[test_indices], y[test_indices] + + # Create feature type list from openml.org indicator + data_info = self.get_data_info(dataset, len(attribute_names)) + + # Train + if task.task_type == 'Supervised Classification': + automl = AutoKaggle() + elif task.task_type == 'Supervised Regression': + automl = AutoKaggle(LgbmRegressor) + else: + print("UNSUPPORTED TASK_TYPE") + assert(0) + + automl.fit(x_train, y_train, time_limit=time_limit, data_info=data_info) + + # Evaluate + y_hat = automl.predict(x_test) + + if task.task_type == 'Supervised Classification': + is_binary = True if len(task.class_labels) <= 2 else False + result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) + self.cls_results.loc[len(self.cls_results)] = result + elif task.task_type == 'Supervised Regression': + result = task_info + self.measure_performance_rgs(y_test, y_hat) + self.rgs_results.loc[len(sel.rgs_results)] = result + print(result) + return result + + +class BenchmarkingAutoSklearn(BenchmarkingBase): + def get_data_info(self, categorical_indicator): + return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] + + def evaluate(self, task_id, time_limit=10*60): + task_info = ["autosklearn", task_id, time_limit] + task = openml.tasks.get_task(task_id) + train_indices, test_indices = task.get_train_test_split_indices() + dataset = task.get_dataset() + X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + + x_train, y_train = X[train_indices], y[train_indices] + x_test, y_test = X[test_indices], y[test_indices] + + # Create feature type list from openml.org indicator + feat_type = self.get_data_info(categorical_indicator) + + # Train + if task.task_type == 'Supervised Classification': + automl = AutoSklearnClassifier( + time_left_for_this_task=time_limit, + per_run_time_limit=time_limit//10, **kwargs) + elif task.task_type == 'Supervised Regression': + automl = AutoSklearnRegressor( + time_left_for_this_task=time_limit, + per_run_time_limit=time_limit//10, **kwargs) + else: + print("UNSUPPORTED TASK_TYPE") + assert(0) + + automl.fit(x_train, y_train, feat_type=feat_type) + + y_hat = automl.predict(x_test) + if task.task_type == 'Supervised Classification': + is_binary = True if len(task.class_labels) <= 2 else False + result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) + self.cls_results.loc[len(self.cls_results)] = result + elif task.task_type == 'Supervised Regression': + result = task_info + self.measure_performance_rgs(y_test, y_hat) + self.rgs_results.loc[len(self.rgs_results)] = result + self.results.append(result) + 
print(result) + return result \ No newline at end of file From fadf0ab3172e756136f3fde828d8cfa4d73b780d Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 25 Jul 2019 14:39:30 -0500 Subject: [PATCH 04/31] Add estimators --- autokaggle/ensemblers.py | 66 ++--------- autokaggle/estimators.py | 182 +++++++++++++++++++++++++++++ autokaggle/hparam_space/rf_hp.json | 2 +- autokaggle/preprocessor.py | 33 +++++- examples/benchmarking.py | 122 +++++++++++++++++-- 5 files changed, 338 insertions(+), 67 deletions(-) create mode 100644 autokaggle/estimators.py diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 9081211..532072e 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -1,13 +1,13 @@ from sklearn.base import BaseEstimator -from tabular_preprocessor import TabularPreprocessor -from utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from autokaggle.preprocessor import TabularPreprocessor +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from abc import abstractmethod import numpy as np import os import random import json +from statistics import mode -from lightgbm import LGBMClassifier, LGBMRegressor from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error @@ -34,26 +34,11 @@ def predict(self, predictions): return predictions.apply(mode, axis=1).values class EnsembleSelection: - def __init__(self, task_type, metric, mode='fast'): - self.ensemble_size = None - self.mode = mode - self.task_type = task_type - self.metric = metric + def __init__(self, ensemble_size=25): + self.ensemble_size = ensemble_size def fit(self, predictions, y_true): - if self.mode == 'fast': - self.fast(predictions, y_true) - else: - self.slow(predictions, y_true) - self._calculate_weights() - - def predict(self, predictions): - return np.average(predictions, axis=0, weights=self.weights_) - - def fast(self, predictions, labels): - """Fast version of Rich Caruana's ensemble selection method.""" - self.ensemble_size = len(predictions) - + """Rich Caruana's ensemble selection method. 
(fast version)""" ensemble = [] trajectory = [] order = [] @@ -94,39 +79,6 @@ def fast(self, predictions, labels): break self.indices_ = order - - def slow(self, predictions, labels): - """Rich Caruana's ensemble selection method.""" - self.ensemble_size = len(predictions) - - ensemble = [] - trajectory = [] - order = [] - - for i in range(self.ensemble_size): - scores = np.zeros([predictions.shape[0]]) - for j, pred in enumerate(predictions): - ensemble.append(pred) - ensemble_prediction = np.mean(np.array(ensemble), axis=0) - scores[j] = calculate_score( - solution=labels, - prediction=ensemble_prediction, - task_type=self.task_type, - metric=self.metric, - all_scoring_functions=False) - ensemble.pop() - best = np.nanargmin(scores) - ensemble.append(predictions[best]) - trajectory.append(scores[best]) - order.append(best) - - # Handle special case - if len(predictions) == 1: - break - - self.indices_ = np.array(order) - - def _calculate_weights(self): ensemble_members = Counter(self.indices_).most_common() weights = np.zeros((self.ensemble_size,), dtype=float) for ensemble_member in ensemble_members: @@ -136,4 +88,8 @@ def _calculate_weights(self): if np.sum(weights) < 1: weights = weights / np.sum(weights) - self.weights_ = weights \ No newline at end of file + self.weights_ = weights + self._calculate_weights() + + def predict(self, predictions): + return np.average(predictions, axis=1, weights=self.weights_) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py new file mode 100644 index 0000000..d9907ac --- /dev/null +++ b/autokaggle/estimators.py @@ -0,0 +1,182 @@ +from sklearn.base import BaseEstimator +from abc import abstractmethod +import numpy as np +import os +import random +import json + +from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.svm import SVC +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load + +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json + +class TabularEstimator(BaseEstimator): + def __init__(self, path=None, verbose=True, time_limit=None): + """ + Initialization function for tabular supervised learner. 
+ """ + self.verbose = verbose + self.path = path + self.time_limit = time_limit + self.objective = None + abs_cwd = os.path.split(os.path.abspath(__file__))[0] + self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) + self.clf = None + self.estimator = None + + def fit(self, x, y): + self.init_model(y) + self.search(x, y) + self.clf.fit(x, y) + self.save_model() + + def predict(self, x, y=None): + y = self.clf.predict(x, ) + return y + + def search(self, x, y, search_iter=40, folds=3): + # Set small sample for hyper-param search + if x.shape[0] > 600: + grid_train_percentage = max(600.0 / x.shape[0], 0.1) + else: + grid_train_percentage = 1 + grid_n = int(x.shape[0] * grid_train_percentage) + idx = random.sample(list(range(x.shape[0])), grid_n) + grid_train_x, grid_train_y = x[idx, :], y[idx] + + if type(self.hparams) != list: + self.hparams = [self.hparams] + + best_params = {} + for idx, search_space in enumerate(self.hparams): + best_params.update(search_space) + if self.verbose: + print("Step: {}".format(idx+1)) + print("Search space:") + print(best_params) + score_metric, skf = self.get_skf(folds) + random_search = RandomizedSearchCV(self.estimator, param_distributions=best_params, n_iter=search_iter, + scoring=score_metric, + n_jobs=1, cv=skf, verbose=0, random_state=1001, iid=False) + random_search.fit(grid_train_x, grid_train_y) + best_params = random_search.best_params_ + for key, value in best_params.items(): + best_params[key] = [value] + + self.clf = random_search.best_estimator_ + + return random_search.best_params_ + + @abstractmethod + def save_model(self): + pass + + @abstractmethod + def init_model(self, y): + pass + + @abstractmethod + def get_skf(self, folds): + pass + + def __repr__(self): + return "TabularEstimator model" + + +class Classifier(TabularEstimator): + """Classifier class. + It is used for tabular data classification with lightgbm classifier. + """ + def __init__(self, path=None, verbose=True, time_limit=None): + super().__init__(path, verbose, time_limit) + self.objective = 'classification' + + def get_skf(self, folds): + if self.objective == 'binary': + score_metric = 'roc_auc' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + else: + score_metric = 'f1_weighted' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + return score_metric, skf + + +class Regressor(TabularEstimator): + """Regressor class. + It is used for tabular data regression with lightgbm regressor. 
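The search routine above treats self.hparams as a list of stages: each stage is explored with RandomizedSearchCV, and the best values found so far are frozen (wrapped back into one-element lists) before the next stage runs, which is how the two-part lgbm_hp.json and three-part rf_hp.json files are meant to be read. A stripped-down sketch of that control flow, with an ad-hoc three-stage space and a random forest standing in for the real estimators:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

def staged_random_search(estimator, stages, x, y, scoring="f1_weighted", n_iter=4):
    """Run one RandomizedSearchCV per stage, carrying the best params forward."""
    frozen, best_model = {}, None
    for stage in stages:
        space = {**frozen, **stage}         # frozen winners plus this stage's candidates
        search = RandomizedSearchCV(
            estimator, param_distributions=space, n_iter=n_iter,
            scoring=scoring,
            cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1001),
            random_state=1001)
        search.fit(x, y)
        best_model = search.best_estimator_
        frozen = {k: [v] for k, v in search.best_params_.items()}
    return best_model, frozen

x, y = make_classification(n_samples=300, random_state=0)
stages = [{"criterion": ["entropy", "gini"], "max_features": [0.3, 0.6, 1.0]},
          {"min_samples_leaf": [1, 2, 3, 5, 8]},
          {"n_estimators": [50, 100, 150, 200]}]
model, best_params = staged_random_search(RandomForestClassifier(), stages, x, y)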
+ """ + def __init__(self, path=None, verbose=True, time_limit=None): + super().__init__(path, verbose, time_limit) + self.objective = 'regression' + + def get_skf(self, folds): + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) + + +class LGBMMixIn: + _default_hyperparams = "lgbm_hp.json" + + def save_model(self): + self.clf.booster_.save_model(self.save_filename) + + def get_feature_importance(self): + if self.estimator: + print('Feature Importance:') + print(self.clf.feature_importances_) + + +class SklearnMixIn: + + def save_model(self): + dump(self.clf, self.save_filename) + + def load_model(self): + self.clf = load(self.save_filename) + + +class SVMClassifier(Classifier, SklearnMixIn): + _default_hyperparams = "svm_hp.json" + + def init_model(self, y): + n_classes = len(set(y)) + self.objective = 'binary' if n_classes == 2 else 'multiclass' + self.estimator = SVC() + + +class RFClassifier(Classifier, SklearnMixIn): + _default_hyperparams = "rf_hp.json" + + def init_model(self, y): + n_classes = len(set(y)) + self.objective = 'binary' if n_classes == 2 else 'multiclass' + self.estimator = RandomForestClassifier() + +class LgbmClassifier(Classifier, LGBMMixIn): + def init_model(self, y): + n_classes = len(set(y)) + if n_classes == 2: + self.objective = 'binary' + self.estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) + else: + self.objective = 'multiclass' + self.estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + num_class=n_classes, + objective=self.objective) + + +class LgbmRegressor(Regressor, LGBMMixIn): + def init_model(self, y): + self.estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) \ No newline at end of file diff --git a/autokaggle/hparam_space/rf_hp.json b/autokaggle/hparam_space/rf_hp.json index a56221a..c23c577 100644 --- a/autokaggle/hparam_space/rf_hp.json +++ b/autokaggle/hparam_space/rf_hp.json @@ -1 +1 @@ -{"criterion": ["entropy", "gini"], "max_features": [0.1, 0.3, 0.5, 0.7, 1.0], "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10], "min_samples_split": [2, 3, 4], "min_samples_leaf": [1, 2, 3], "n_estimators": [50, 100, 150, 200]} \ No newline at end of file +[{"criterion": ["entropy", "gini"], "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], "n_estimators": [300], "min_samples_leaf": [1]}, {"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {"n_estimators": [50, 100, 150, 200]}] \ No newline at end of file diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 543e99a..9509261 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -1,6 +1,9 @@ import numpy as np from pandas import DataFrame +import scipy from scipy.stats import pearsonr +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler LEVEL_HIGH = 32 @@ -96,8 +99,8 @@ def __init__(self): self.high_level_cat_keys = [] self.feature_add_high_cat = 0 - self.feature_add_cat_num = 0 - self.feature_add_cat_cat = 0 + self.feature_add_cat_num = 10 + self.feature_add_cat_cat = 10 self.order_num_cat_pair = {} self.selected_cols = None @@ -106,6 +109,8 @@ def __init__(self): self.n_time = None self.n_num = None self.n_cat = None + self.pca = None + self.scaler = None def remove_useless(self, x): """ @@ -262,6 +267,9 @@ def fit(self, raw_x, y, time_limit, data_info): print('#TIME features: {}'.format(self.n_time)) print('#NUM features: {}'.format(self.n_num)) print('#CAT features: 
{}'.format(self.n_cat)) + + # Convert sparse to dense if needed + raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], 'NUM': raw_x[:, self.data_info == 'NUM'], 'CAT': raw_x[:, self.data_info == 'CAT']} @@ -292,7 +300,17 @@ def fit(self, raw_x, y, time_limit, data_info): # Encode high-order categorical data to numerical with frequency x = self.cat_to_num(x, y) - + + # Standardize numeric columns + if self.n_num > 0: + self.scaler = StandardScaler() + x[:, :self.n_num] = self.scaler.fit_transform(x[:, :self.n_num]) + + # PCA for feature generation + self.pca = PCA(n_components=0.99, svd_solver='full') + x_pca = self.pca.fit_transform(x[:, :self.n_num]) + x = np.concatenate([x, x_pca], axis=1) + x = self.process_time(x) x = self.remove_useless(x) @@ -318,6 +336,8 @@ def encode(self, raw_x, time_limit=None): else: self.budget = time_limit + # Convert sparse to dense if needed + raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], 'NUM': raw_x[:, self.data_info == 'NUM'], 'CAT': raw_x[:, self.data_info == 'CAT']} @@ -330,6 +350,13 @@ def encode(self, raw_x, time_limit=None): x = self.cat_to_num(x) x = self.process_time(x) + + if self.scaler: + x[:, :self.n_num] = self.scaler.transform(x[:, :self.n_num]) + if self.pca: + x_pca = self.pca.transform(x[:, :self.n_num]) + x = np.concatenate([x, x_pca], axis=1) + if self.selected_cols is not None: x = x[:, self.selected_cols] return x diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 860b48b..c62b1dc 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -1,3 +1,5 @@ +import sys +sys.path.append("../") import numpy as np import pandas as pd import sklearn.model_selection @@ -26,8 +28,7 @@ def __init__(self, supress_warnings=True, sess_name=""): if supress_warnings: warnings.filterwarnings('ignore') self.results = [] - if not sess_name: - sess_name = generate_rand_string(6) + self.sess_name = generate_rand_string(6) if not sess_name else sess_name self.cls_results = pd.DataFrame(columns=self.cls_desc) self.rgs_results = pd.DataFrame(columns=self.rgs_desc) @@ -45,8 +46,10 @@ def measure_performance_rgs(self, y_true, y_pred): return [mse, mae, r2] def export_results(self): - self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) - self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", index=False) + if len(self.cls_results) > 0: + self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) + if len(self.rgs_results) > 0: + self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", index=False) @abstractmethod def evaluate(self, task, time_limit): @@ -65,8 +68,23 @@ def time_lapse(self, task_id, time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 24 tl_results.append(self.evaluate(task_id, time_limit=time_limit)) return tl_results + def get_dataset_splits(self, task_id): + task = openml.tasks.get_task(task_id) + train_indices, test_indices = task.get_train_test_split_indices() + dataset = task.get_dataset() + X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + + x_train, y_train = X[train_indices], y[train_indices] + x_test, y_test = X[test_indices], y[test_indices] + return x_train, y_train, x_test, y_test + class BenchmarkingAutoKaggle(BenchmarkingBase): + estimator_type = None + + def set_estimator_type(self, est_type): 
+ self.estimator_type = est_type + def get_data_info(self, dataset, num_cols): nominal_feat = dataset.get_features_by_type('nominal') numerical_feat = dataset.get_features_by_type('numeric') @@ -98,9 +116,9 @@ def evaluate(self, task_id, time_limit=10*60): # Train if task.task_type == 'Supervised Classification': - automl = AutoKaggle() + automl = AutoKaggle() if not self.estimator_type else AutoKaggle(self.estimator_type) elif task.task_type == 'Supervised Regression': - automl = AutoKaggle(LgbmRegressor) + automl = AutoKaggle(LgbmRegressor) if not self.estimator_type else AutoKaggle(self.estimator_type) else: print("UNSUPPORTED TASK_TYPE") assert(0) @@ -116,7 +134,7 @@ def evaluate(self, task_id, time_limit=10*60): self.cls_results.loc[len(self.cls_results)] = result elif task.task_type == 'Supervised Regression': result = task_info + self.measure_performance_rgs(y_test, y_hat) - self.rgs_results.loc[len(sel.rgs_results)] = result + self.rgs_results.loc[len(self.rgs_results)] = result print(result) return result @@ -163,4 +181,92 @@ def evaluate(self, task_id, time_limit=10*60): self.rgs_results.loc[len(self.rgs_results)] = result self.results.append(result) print(result) - return result \ No newline at end of file + return result + +def get_dataset_ids(task_ids): + if type(task_ids) == list: + return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids] + else: + return openml.tasks.get_task(task_ids).dataset_id + +def get_task_info(task_ids): + task_types = [] + dataset_list = [] + for i, t_id in enumerate(task_ids): + task = openml.tasks.get_task(t_id) + dataset = openml.datasets.get_dataset(task.dataset_id) + if task.task_type_id == 1: + _, y, _, _ = dataset.get_data(target=task.target_name, dataset_format='array') + task_type = "Binary Classification" if len(set(y)) <= 2 else "Multiclass classification ({})".format(len(set(y))) + else: + task_type = "Regression" + task_types.append(task_type) + dataset_list.append(dataset) + return dataset_list, task_types + +def get_dataset_properties(task_ids): + dataset_list, task_types = get_task_info(task_ids) + df = pd.DataFrame(columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", "#Date"]) + for i, dataset in enumerate(dataset_list): + df.loc[i] = [ + dataset.name, + dataset.qualities["NumberOfInstances"], + task_types[i], + len(dataset.get_features_by_type('numeric')), + len(dataset.get_features_by_type('nominal')), + len(dataset.get_features_by_type('string')), + len(dataset.get_features_by_type('date')), + ] + return df + +def get_performance_table(filename, metric): + """ + Read the results csv and convert into the performance table based on the median of the results for each task. 
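+        Illustrative call (a sketch, not part of this patch): the filename and metric below are
+        placeholders matching the csv written by export_results and its accuracy column, e.g.
+            perf = get_performance_table("mysession_classification_results.csv", metric="accuracy")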
+ """ + test = pd.read_csv(filename) + perf = pd.DataFrame(columns=["Name", "AutoKaggle", "AutoSklearn", "H2O.ai"]) + task_ids = list(set(test["task_id"])) + dataset_ids = get_dataset_ids(task_ids) + + test = test.set_index(["task_id", "automl_model"]) + test.sort_index(inplace=True) + for i, t_id in enumerate(task_ids): + try: + name = openml.datasets.get_dataset(dataset_ids[i]).name + auto_kaggle = test.loc[(t_id, "autokaggle")][metric].median() if (t_id, "autokaggle") in test.index else np.nan + auto_sklearn = test.loc[(t_id, "autosklearn")][metric].median() if (t_id, "autosklearn") in test.index else np.nan + h2o_ai = test.loc[(t_id, "autosklearn")][metric].median() if (t_id, "autosklearn") in test.index else np.nan + perf.loc[i] = [name, auto_kaggle, auto_sklearn, h2o_ai] + except Exception as e: + print(e) + return perf + +def style_results(res): + """ + Highlight the max results and set index to name + """ + def highlight_max(s): + ''' + highlight the maximum in a Series yellow. + ''' + is_max = s == s.max() + return ['background-color: yellow' if v else '' for v in is_max] + res = res.set_index("Name") + res.style.apply(highlight_max, axis=1) + return res + +import statistics +def get_box_plot(data, task_id, metric): + """ + Plots the boxplot of variance + """ + auto_sklearn = list(data.loc[(task_id, "autosklearn")][metric]) + auto_kaggle = list(data.loc[(task_id, "autokaggle")][metric]) + med_sk = statistics.median(auto_sklearn) + med_ak = statistics.median(auto_kaggle) + while len(auto_sklearn) < len(auto_kaggle): + auto_sklearn.append(med_sk) + while len(auto_sklearn) > len(auto_kaggle): + auto_kaggle.append(med_ak) + temp = pd.DataFrame(data={"Autokaggle":auto_kaggle, "AutoSklearn":auto_sklearn}) + temp.boxplot() From 1f800e1902fbe6efe522b62c6572654e51d3b1a8 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 29 Aug 2019 13:54:54 -0500 Subject: [PATCH 05/31] Fixed the tests --- autokaggle/auto_ml.py | 14 +++++++------- autokaggle/estimators.py | 12 +++++++++--- examples/tabular_classification_binary.py | 3 ++- examples/tabular_classification_multiclass.py | 3 ++- examples/tabular_regression.py | 4 ++-- 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index e2936e5..6302d32 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -11,17 +11,17 @@ from autokaggle.estimators import * from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json + class AutoKaggle(BaseEstimator): - def __init__(self, estimator_class=LgbmClassifier, path=None, verbose=True): + def __init__(self, estimator=LgbmClassifier(), preprocessor=TabularPreprocessor(), path=None, verbose=True): """ Initialization function for tabular supervised learner. 
""" self.verbose = verbose self.is_trained = False self.objective = None - self.preprocessor = None - self.model = None - self.estimator_class = estimator_class + self.preprocessor = preprocessor + self.model = estimator self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) if self.verbose: @@ -56,9 +56,9 @@ def fit(self, x, y, time_limit=None, data_info=None): x = np.concatenate([x, x], axis=0) y = np.concatenate([y, y], axis=0) - # Init model and preprocessor - self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) - self.preprocessor = TabularPreprocessor() + # # Init model and preprocessor + # self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) + # self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor x = self.preprocessor.fit(x, y, self.time_limit, data_info) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index d9907ac..0d4e73d 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -15,6 +15,7 @@ from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json + class TabularEstimator(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None): """ @@ -39,16 +40,21 @@ def predict(self, x, y=None): y = self.clf.predict(x, ) return y - def search(self, x, y, search_iter=40, folds=3): + @staticmethod + def subsample(x, y, sample_percent): # Set small sample for hyper-param search if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], 0.1) + grid_train_percentage = max(600.0 / x.shape[0], sample_percent) else: grid_train_percentage = 1 grid_n = int(x.shape[0] * grid_train_percentage) idx = random.sample(list(range(x.shape[0])), grid_n) grid_train_x, grid_train_y = x[idx, :], y[idx] - + return grid_train_x, grid_train_y + + def search(self, x, y, search_iter=40, folds=3): + grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) + if type(self.hparams) != list: self.hparams = [self.hparams] diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index b4fda77..0ebd91f 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -1,6 +1,7 @@ import numpy as np import sys -sys.path.append("../autokaggle/") + +sys.path.append("..") from autokaggle import * if __name__ == '__main__': diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 19284cd..13612f6 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -1,6 +1,7 @@ import numpy as np import sys -sys.path.append("../autokaggle/") +sys.path.append("..") +# print(sys.path) from autokaggle import * if __name__ == '__main__': diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 8c1ed50..ccb2064 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -1,6 +1,6 @@ import numpy as np import sys -sys.path.append("../autokaggle/") +sys.path.append("..") from autokaggle import * if __name__ == '__main__': @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle(LgbmRegressor) + clf = AutoKaggle(LgbmRegressor()) datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 
a7b66c332f6c96dee2d0932fdefc5f75a9fc1b80 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 29 Aug 2019 20:42:27 -0500 Subject: [PATCH 06/31] Use hyper-opt for search --- autokaggle/auto_ml.py | 8 +-- autokaggle/estimators.py | 99 ++++++++++++++++++++++++++------------ autokaggle/preprocessor.py | 8 +-- 3 files changed, 75 insertions(+), 40 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 6302d32..c9689c3 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -44,10 +44,7 @@ def fit(self, x, y, time_limit=None, data_info=None): you should warm-start your training from the pre-trained model. Past data will NOT be available for re-training. """ - - if time_limit is None: - time_limit = 24 * 60 * 60 - self.time_limit = time_limit + self.time_limit = time_limit if time_limit else 24 * 60 * 60 if x.shape[1] == 0: raise ValueError("No feature exist!") @@ -63,7 +60,6 @@ def fit(self, x, y, time_limit=None, data_info=None): # Fit Model and preprocessor x = self.preprocessor.fit(x, y, self.time_limit, data_info) self.model.fit(x, y) - self.model.save_model() self.is_trained = True if self.verbose: @@ -73,7 +69,7 @@ def fit(self, x, y, time_limit=None, data_info=None): def predict(self, x_test): """ This function should provide predictions of labels on (test) data. - The function predict eventually casdn return probabilities or continuous values. + The function predict eventually can return probabilities or continuous values. """ x_test = self.preprocessor.encode(x_test) y = self.model.predict(x_test, ) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 0d4e73d..0576430 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -6,14 +6,17 @@ import json from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import RandomizedSearchCV, cross_val_score from sklearn.model_selection import StratifiedKFold, KFold from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer from joblib import dump, load from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +import hyperopt +from hyperopt import tpe, hp, fmin, space_eval class TabularEstimator(BaseEstimator): @@ -27,17 +30,16 @@ def __init__(self, path=None, verbose=True, time_limit=None): self.objective = None abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) - self.clf = None - self.estimator = None + self.best_estimator_ = None def fit(self, x, y): self.init_model(y) self.search(x, y) - self.clf.fit(x, y) + self.best_estimator_.fit(x, y) self.save_model() def predict(self, x, y=None): - y = self.clf.predict(x, ) + y = self.best_estimator_.predict(x, ) return y @staticmethod @@ -54,29 +56,64 @@ def subsample(x, y, sample_percent): def search(self, x, y, search_iter=40, folds=3): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) + score_metric, skf = self.get_skf(folds) - if type(self.hparams) != list: - self.hparams = [self.hparams] - - best_params = {} - for idx, search_space in enumerate(self.hparams): - best_params.update(search_space) - if self.verbose: - print("Step: {}".format(idx+1)) - print("Search space:") - 
print(best_params) - score_metric, skf = self.get_skf(folds) - random_search = RandomizedSearchCV(self.estimator, param_distributions=best_params, n_iter=search_iter, - scoring=score_metric, - n_jobs=1, cv=skf, verbose=0, random_state=1001, iid=False) - random_search.fit(grid_train_x, grid_train_y) - best_params = random_search.best_params_ - for key, value in best_params.items(): - best_params[key] = [value] + self.hparams = space = hp.choice('classifier', [ + {'model': KNeighborsClassifier, + 'param': {'n_neighbors': + hp.choice('n_neighbors', range(3, 11)), + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree']), + 'leaf_size': hp.choice('leaf_size', range(1, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", + "chebyshev", "minkowski" + ])} + }, + {'model': SVC, + 'param': {'C': hp.lognormal('C', 0, 1), + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'rbf', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 15)), + 'gamma': hp.uniform('gamma', 0.001, 10000)} + } + ]) + + def objective_func(args): + clf = args['model'](**args['param']) + loss = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + print("CV Score:", loss) + print("\n=================") + return 1 - loss + + opt = space_eval(self.hparams, fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, + max_evals=search_iter)) + self.best_estimator_ = opt['model'](**opt['param']) - self.clf = random_search.best_estimator_ + return opt - return random_search.best_params_ + # def search(self, x, y, search_iter=40, folds=3): + # grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) + # + # if type(self.hparams) != list: + # self.hparams = [self.hparams] + # + # best_params = {} + # for idx, search_space in enumerate(self.hparams): + # best_params.update(search_space) + # if self.verbose: + # print("Step: {}".format(idx+1)) + # print("Search space:") + # print(best_params) + # score_metric, skf = self.get_skf(folds) + # random_search = RandomizedSearchCV(self.estimator, param_distributions=best_params, n_iter=search_iter, + # scoring=score_metric, + # n_jobs=1, cv=skf, verbose=0, random_state=1001, iid=False) + # random_search.fit(grid_train_x, grid_train_y) + # best_params = random_search.best_params_ + # for key, value in best_params.items(): + # best_params[key] = [value] + # + # self.best_estimator_ = random_search.best_estimator_ + # + # return random_search.best_params_ @abstractmethod def save_model(self): @@ -128,21 +165,21 @@ class LGBMMixIn: _default_hyperparams = "lgbm_hp.json" def save_model(self): - self.clf.booster_.save_model(self.save_filename) + self.best_estimator_.booster_.save_model(self.save_filename) def get_feature_importance(self): - if self.estimator: + if self.best_estimator_: print('Feature Importance:') - print(self.clf.feature_importances_) + print(self.best_estimator_.feature_importances_) class SklearnMixIn: def save_model(self): - dump(self.clf, self.save_filename) + dump(self.best_estimator_, self.save_filename) def load_model(self): - self.clf = load(self.save_filename) + self.best_estimator_ = load(self.save_filename) class SVMClassifier(Classifier, SklearnMixIn): diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 9509261..a9d56bf 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -253,15 +253,15 @@ def fit(self, raw_x, y, time_limit, data_info): 'TIME' for temporal feature, 'NUM' for other numerical feature, and 'CAT' for categorical feature. 
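        Illustrative sketch (not part of this patch; x_train, y_train and the per-type column
        counts are placeholders):
            data_info = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat)
            prep = TabularPreprocessor()
            prep.fit(x_train, y_train, 24 * 60 * 60, data_info)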
""" - # Get Meta-Feature self.budget = time_limit + # Extract or read data info self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) print('DATA_INFO: {}'.format(self.data_info)) + # Set the meta info for each data type self.n_time = sum(self.data_info == 'TIME') self.n_num = sum(self.data_info == 'NUM') self.n_cat = sum(self.data_info == 'CAT') - self.total_samples = raw_x.shape[0] print('#TIME features: {}'.format(self.n_time)) @@ -270,11 +270,13 @@ def fit(self, raw_x, y, time_limit, data_info): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x + + # convert to a dictionary of different datatype raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], 'NUM': raw_x[:, self.data_info == 'NUM'], 'CAT': raw_x[:, self.data_info == 'CAT']} - + # Init the label encoders for each categorical column for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat): self.cat_to_int_label[col_index] = {} From 27ed4b8e39a153586124de01d4d13088d8abd1c3 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Mon, 2 Sep 2019 17:56:33 -0500 Subject: [PATCH 07/31] Added code for primitives --- autokaggle/auto_ml.py | 9 +- autokaggle/preprocessor.py | 538 ++++++++++++++++++------------------- 2 files changed, 265 insertions(+), 282 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index c9689c3..f77e004 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -58,7 +58,8 @@ def fit(self, x, y, time_limit=None, data_info=None): # self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor - x = self.preprocessor.fit(x, y, self.time_limit, data_info) + self.preprocessor.fit(x, y, self.time_limit, data_info) + x = self.preprocessor.transform(x) self.model.fit(x, y) self.is_trained = True @@ -71,7 +72,7 @@ def predict(self, x_test): This function should provide predictions of labels on (test) data. The function predict eventually can return probabilities or continuous values. 
""" - x_test = self.preprocessor.encode(x_test) + x_test = self.preprocessor.transform(x_test) y = self.model.predict(x_test, ) if y is None: raise ValueError("Tabular predictor does not exist") @@ -91,5 +92,5 @@ def evaluate(self, x_test, y_test): return results def final_fit(self, x_train, y_train): - x_train = self.preprocessor.encode(x_train) - self.model.fit(x_train, y_train) \ No newline at end of file + x_train = self.preprocessor.transform(x_train) + self.model.fit(x_train, y_train) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index a9d56bf..f155a73 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -4,87 +4,16 @@ from scipy.stats import pearsonr from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler - +from sklearn.base import TransformerMixin +from sklearn.base import BaseEstimator +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from abc import abstractmethod +import collections +from lightgbm import LGBMClassifier, LGBMRegressor LEVEL_HIGH = 32 -def parallel_function(labels, first_batch_keys, task): - if task == 'label': - if min(labels) > first_batch_keys: - labels = labels - np.min(labels) - return labels.reshape(labels.shape[0], 1) - - elif task == 'frequency': - cat_dict = {} - n_rows = labels.shape[0] - labels = np.expand_dims(labels, axis=1) - - if min(labels) > first_batch_keys: - labels = labels - np.min(labels) - - frequencies = np.zeros((n_rows, 1)) - - for row_index in range(n_rows): - key = labels[row_index, 0] - if key in cat_dict: - cat_dict[key] += 1 - else: - cat_dict[key] = 1 - - n_level = len(cat_dict) - key_to_frequency = {} - - for key in cat_dict.keys(): - key_to_frequency[key] = cat_dict[key] / n_rows * n_level - - for row_index in range(n_rows): - key = labels[row_index, 0] - frequencies[row_index][0] = key_to_frequency[key] - - return frequencies - elif task == 'num_cat': - df = DataFrame(data=labels) - return df.join(df.groupby(1)[0].mean(), - rsuffix='r', - on=1).values[:, -1:] - elif task == 'cat_cat': - df = DataFrame(data=labels) - df[3] = list(range(len(labels))) - return df.join(df.groupby([0, 1]).count(), - rsuffix='r', - on=(0, 1)).values[:, -1:] - elif task == 'train_num_cat': - y = first_batch_keys[0] - df = DataFrame(data=labels) - fe = df.join(df.groupby(1)[0].mean(), - rsuffix='r', - on=1).values[:, -1:] - mu = abs(pearsonr(np.squeeze(np.array(fe)), y)[0]) - if np.isnan(mu): - mu = 0 - return [[first_batch_keys[1], first_batch_keys[2], mu, mu], first_batch_keys[3]] - - elif task == 'train_cat_cat': - y = first_batch_keys[0] - df = DataFrame(data=labels) - df[3] = list(range(len(labels))) - fe = df.join(df.groupby([0, 1]).count(), - rsuffix='r', - on=(0, 1)).values[:, -1:] - mu = abs(pearsonr(np.squeeze(np.array(fe)), y)[0]) - if np.isnan(mu): - mu = 0 - return [[first_batch_keys[1], first_batch_keys[2], mu], first_batch_keys[3]] - return None - - -def call_parallel(tasks): - results = [] - for t in tasks: - results.append(parallel_function(t[0], t[1], t[2])) - return results - - class TabularPreprocessor: def __init__(self): """ @@ -109,136 +38,10 @@ def __init__(self): self.n_time = None self.n_num = None self.n_cat = None - self.pca = None - self.scaler = None - - def remove_useless(self, x): - """ - Remove the columns whose max_value == min_value - """ - self.selected_cols = np.where(np.max(x, axis=0) - np.min(x, axis=0) != 0)[0] - return x[:, self.selected_cols] - - def process_time(self, x): - """ - Process the TIME 
features. Add the difference between consecutive columns as a feature. - """ - cols = range(self.n_time) - if len(cols) > 10: - cols = cols[:10] - x_time = x[:, cols] - for i in cols: - for j in range(i + 1, len(cols)): - x = np.append(x, np.expand_dims(x_time[:, i] - x_time[:, j], 1), 1) - return x - - def extract_data(self, raw_x): - """ - Encodes the catgorical data and returns all numeric values - """ - # only get numerical variables - ret = np.concatenate([raw_x['TIME'], raw_x['NUM'], raw_x['CAT']], axis=1) - n_rows = ret.shape[0] - n_num_col = ret.shape[1] - self.n_cat - - n_cat_col = self.n_cat - if n_cat_col <= 0: - return ret.astype(np.float64) - - # preprocess (multi-value) categorical data - for col_index in range(n_num_col, n_num_col + n_cat_col): - for row_index in range(n_rows): - key = str(ret[row_index, col_index]) - if key in self.cat_to_int_label[col_index]: - ret[row_index, col_index] = self.cat_to_int_label[col_index][key] - continue - new_value = len(self.cat_to_int_label[col_index]) - self.cat_to_int_label[col_index][key] = new_value - ret[row_index, col_index] = new_value - - return ret.astype(np.float64) - - def cat_to_num(self, x, y=None): - if y is not None: - mark = self.n_time + self.n_num - - for col_index in range(self.n_time + self.n_num, self.n_time + self.n_num + self.n_cat): - if self.n_first_batch_keys[col_index] <= LEVEL_HIGH: - self.num_cat_pair[mark] = (col_index,) - mark += 1 - else: - self.num_cat_pair[mark] = (col_index, col_index) - mark += 1 - - mark_1 = 0 - tasks = [] - for i, cat_col_index1 in enumerate(self.high_level_cat_keys): - for cat_col_index2 in self.high_level_cat_keys[i + 1:]: - tasks.append((x[:, (cat_col_index1, cat_col_index2)], - [y, cat_col_index1, cat_col_index2, mark_1], - 'train_cat_cat')) - mark_1 += 1 - - all_results = call_parallel(tasks) - - num_cat_pair_1 = {} - pearsonr_dict_1 = {} - for result in all_results: - if result[0][-1] > 0.001: - pearsonr_dict_1[result[1]] = result[0][-1] - num_cat_pair_1[result[1]] = result[0] - pearsonr_high_1 = sorted(pearsonr_dict_1, key=pearsonr_dict_1.get, reverse=True)[:self.feature_add_cat_cat] - num_cat_pair_1 = {key: num_cat_pair_1[key] for key in pearsonr_high_1} - num_cat_pair_1 = {i + mark: num_cat_pair_1[key] for i, key in enumerate(num_cat_pair_1)} - self.num_cat_pair.update(num_cat_pair_1) - mark += len(pearsonr_high_1) - - mark_2 = 0 - tasks_2 = [] - for cat_col_index in self.high_level_cat_keys: - for num_col_index in range(self.n_time, self.n_time + self.n_num): - tasks_2.append((x[:, (num_col_index, cat_col_index)], - [y, num_col_index, cat_col_index, mark_2], - 'train_num_cat')) - mark_2 += 1 - - all_results = call_parallel(tasks_2) - - num_cat_pair_2 = {} - pearsonr_dict_2 = {} - for result in all_results: - if result[0][-1] > 0.001: - pearsonr_dict_2[result[1]] = result[0][-1] - num_cat_pair_2[result[1]] = result[0] - pearsonr_high_2 = sorted(pearsonr_dict_2, key=pearsonr_dict_2.get, reverse=True)[:self.feature_add_cat_num] - num_cat_pair_2 = {key: num_cat_pair_2[key] for key in pearsonr_high_2} - num_cat_pair_2 = {i + mark: num_cat_pair_2[key] for i, key in enumerate(num_cat_pair_2)} - self.num_cat_pair.update(num_cat_pair_2) - self.order_num_cat_pair = sorted(list(self.num_cat_pair.keys())) - print('num_cat_pair_2:', num_cat_pair_2) - - tasks = [] - for key in self.order_num_cat_pair: - if len(self.num_cat_pair[key]) == 1: - (col_index,) = self.num_cat_pair[key] - tasks.append((x[:, col_index], self.n_first_batch_keys[col_index], 'label')) - if 
len(self.num_cat_pair[key]) == 2: - (col_index, col_index) = self.num_cat_pair[key] - tasks.append((x[:, col_index], self.n_first_batch_keys[col_index], 'frequency')) - if len(self.num_cat_pair[key]) == 3: - (cat_col_index1, cat_col_index2, mu) = self.num_cat_pair[key] - tasks.append((x[:, (cat_col_index1, - cat_col_index2)], self.n_first_batch_keys[cat_col_index1], 'cat_cat')) - elif len(self.num_cat_pair[key]) == 4: - (num_col_index, cat_col_index, mu, a) = self.num_cat_pair[key] - tasks.append((x[:, (num_col_index, cat_col_index)], self.n_first_batch_keys[cat_col_index], 'num_cat')) - - results = call_parallel(tasks) - all_num = x.shape[1] - self.n_cat - results = [x[:, :all_num]] + results - ret = np.concatenate(results, axis=1) - - return ret + self.cat_col = None + self.num_col = None + self.time_col = None + self.pipeline = None def fit(self, raw_x, y, time_limit, data_info): """ @@ -264,6 +67,10 @@ def fit(self, raw_x, y, time_limit, data_info): self.n_cat = sum(self.data_info == 'CAT') self.total_samples = raw_x.shape[0] + self.cat_col = list(np.where(self.data_info == 'CAT')[0]) + self.num_col = list(np.where(self.data_info == 'NUM')[0]) + self.time_col = list(np.where(self.data_info == 'TIME')[0]) + print('#TIME features: {}'.format(self.n_time)) print('#NUM features: {}'.format(self.n_num)) print('#CAT features: {}'.format(self.n_cat)) @@ -271,54 +78,21 @@ def fit(self, raw_x, y, time_limit, data_info): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - # convert to a dictionary of different datatype - raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], - 'NUM': raw_x[:, self.data_info == 'NUM'], - 'CAT': raw_x[:, self.data_info == 'CAT']} - - # Init the label encoders for each categorical column - for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat): - self.cat_to_int_label[col_index] = {} - - x = self.extract_data(raw_x) - - d_size = x.shape[0] * x.shape[1] / self.budget - if d_size > 35000: - self.feature_add_high_cat = 0 - else: - self.feature_add_high_cat = 10 - - # Iterate cat features - for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat): - self.n_first_batch_keys[col_index] = len(self.cat_to_int_label[col_index]) - high_level_cat_keys_tmp = sorted(self.n_first_batch_keys, key=self.n_first_batch_keys.get, reverse=True)[ - :self.feature_add_high_cat] - for i in high_level_cat_keys_tmp: - if self.n_first_batch_keys[i] > 1e2: - self.high_level_cat_keys.append(i) - - # Convert NaN to zeros - x = np.nan_to_num(x) - - # Encode high-order categorical data to numerical with frequency - x = self.cat_to_num(x, y) - - # Standardize numeric columns - if self.n_num > 0: - self.scaler = StandardScaler() - x[:, :self.n_num] = self.scaler.fit_transform(x[:, :self.n_num]) - - # PCA for feature generation - self.pca = PCA(n_components=0.99, svd_solver='full') - x_pca = self.pca.fit_transform(x[:, :self.n_num]) - x = np.concatenate([x, x_pca], axis=1) - - x = self.process_time(x) - x = self.remove_useless(x) - - return x - - def encode(self, raw_x, time_limit=None): + self.pipeline = Pipeline([ + ('label_encoder', CatEncoder(selected_columns=self.cat_col)), + ('imputer', Imputation(selected_columns=self.cat_col + self.num_col + self.time_col)), + ('scaler', TabScaler(selected_columns=self.num_col)), + ('pca', TabPCA(selected_columns=self.num_col)), + ('time_diff', TimeDiff(selected_columns=self.time_col)), + ('filter', 
FilterConstant(selected_columns=self.time_col + self.num_col + self.cat_col)), + ('pearson_corr', FeatureFilter(selected_columns=self.time_col + self.num_col + self.cat_col)), + ('lgbm_feat_selection', FeatureImportance(selected_columns=self.time_col + self.num_col + self.cat_col)), + ]) + self.pipeline.fit(raw_x, y) + + return self + + def transform(self, raw_x, time_limit=None): """ This function should train the model parameters. @@ -340,28 +114,7 @@ def encode(self, raw_x, time_limit=None): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], - 'NUM': raw_x[:, self.data_info == 'NUM'], - 'CAT': raw_x[:, self.data_info == 'CAT']} - x = self.extract_data(raw_x) - - # Convert NaN to zeros - x = np.nan_to_num(x) - - # Encode high-order categorical data to numerical with frequency - x = self.cat_to_num(x) - - x = self.process_time(x) - - if self.scaler: - x[:, :self.n_num] = self.scaler.transform(x[:, :self.n_num]) - if self.pca: - x_pca = self.pca.transform(x[:, :self.n_num]) - x = np.concatenate([x, x_pca], axis=1) - - if self.selected_cols is not None: - x = x[:, self.selected_cols] - return x + return self.pipeline.transform(raw_x) @staticmethod def extract_data_info(raw_x): @@ -380,3 +133,232 @@ def extract_data_info(raw_x): except: data_info.append('CAT') return np.array(data_info) + + +class TabularData: + def __init__(self, data, data_info): + self.data = data + self.data_info = data_info + + self.total_samples = 0 + + self.cat_to_int_label = {} + self.n_first_batch_keys = {} + self.high_level_cat_keys = [] + + self.num_cat_pair = {} + self.feature_add_high_cat = 0 + self.feature_add_cat_num = 10 + self.feature_add_cat_cat = 10 + self.order_num_cat_pair = {} + + self.selected_cols = None + + self.n_time = None + self.n_num = None + self.n_cat = None + + +class Primitive(BaseEstimator, TransformerMixin): + def __init__(self, selected_columns=[], selected_type=None): + self.selected = selected_columns + self.selected_type = selected_type + + @abstractmethod + def fit(self, X, y=None): + pass + + @abstractmethod + def transform(self, X, y=None): + pass + + +class TabScaler(Primitive): + scaler = None + + def fit(self, X, y=None): + self.scaler = StandardScaler() + self.scaler.fit(X[:, self.selected], y) + return self + + def transform(self, X, y=None): + X[:, self.selected] = self.scaler.transform(X[:, self.selected]) + return X + + +class CatEncoder(Primitive): + cat_to_int_label = {} + + def fit(self, X, y=None): + for col_index in self.selected: + self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) + for row_index in range(len(X)): + key = str(X[row_index, col_index]) + if key not in self.cat_to_int_label[col_index]: + self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) + return self + + def transform(self, X, y=None): + for col_index in self.selected: + for row_index in range(len(X)): + key = str(X[row_index, col_index]) + X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) + return X + + +class FilterConstant(Primitive): + selected_cols = [] + + def fit(self, X, y=None): + self.selected_cols = np.where(np.max(X, axis=0) - np.min(X, axis=0) != 0)[0] + return self + + def transform(self, X, y=None): + return X[:, self.selected_cols] + + +class TimeDiff(Primitive): + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + x_time = X[:, self.selected] + len_cols = 
x_time.shape[1] + for i in range(len_cols): + for j in range(i + 1, len_cols): + X = np.append(X, np.expand_dims(x_time[:, i] - x_time[:, j], 1), 1) + return X + + +class TabPCA(Primitive): + pca = None + + def fit(self, X, y=None): + self.pca = PCA(n_components=0.99, svd_solver='full') + return self + + def transform(self, X, y=None): + x_pca = self.pca.fit_transform(X[:, self.selected]) + return np.concatenate([X, x_pca], axis=1) + + +class CatCount(Primitive): + count_dict = {} + + def fit(self, X, y=None): + for col in self.selected: + self.count_dict[col] = collections.Counter(X[:, col]) + return self + + def transform(self, X, y=None): + for col in self.selected: + gen_freq = np.vectorize(lambda key: self.count_dict[col][key]) + X[:, col] = gen_freq(X[:, col]) + return X + + +class LogTransform(Primitive): + + def fit(self, X, y=None): + return self + + def transform(self, X, y=None): + for col in self.selected: + X[:, col] = np.square(np.log(X[:, col])) + return X + + +class Imputation(Primitive): + imputer = None + + def fit(self, X, y=None): + self.imputer = SimpleImputer(strategy='most_frequent') + self.imputer.fit(X) + return self + + def transform(self, X, y=None): + return self.imputer.transform(X) + + +class FeatureFilter(Primitive): + def __init__(self, selected_columns=[], selected_type=None, threshold=0.001): + super().__init__(selected_columns, selected_type) + self.threshold = threshold + self.drop_columns = [] + + def fit(self, X, y=None): + for col in self.selected: + mu = abs(pearsonr(X[:, col], y)[0]) + if np.isnan(mu): + mu = 0 + if mu < self.threshold: + self.drop_columns.append(col) + return self + + def transform(self, X, y=None): + X = np.delete(X, self.drop_columns, axis=1) + return X + + +class FeatureImportance(Primitive): + def __init__(self, selected_columns=[], selected_type=None, threshold=0.001, task_type='classification'): + super().__init__(selected_columns, selected_type) + self.threshold = threshold + self.drop_columns = [] + self.task_type = task_type + + def fit(self, X, y=None): + if self.task_type == 'classification': + n_classes = len(set(y)) + if n_classes == 2: + estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective='binary') + else: + estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + num_class=n_classes, + objective='multiclass') + elif self.task_type == 'regression': + estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective='regression') + estimator.fit(X, y) + feature_importance = estimator.feature_importances_ + feature_importance = feature_importance/feature_importance.mean() + self.drop_columns = np.where(feature_importance < self.threshold)[0] + return self + + def transform(self, X, y=None): + X = np.delete(X, self.drop_columns, axis=1) + return X + + +if __name__ == "__main__": + ntime, nnum, ncat = 4, 10, 8 + nsample = 1000 + x_num = np.random.random([nsample, nnum]) + x_time = np.random.random([nsample, ntime]) + x_cat = np.random.randint(0, 10, [nsample, ncat]) + + x_all = np.concatenate([x_num, x_time, x_cat], axis=1) + x_train = x_all[:int(nsample * 0.8), :] + x_test = x_all[int(nsample * 0.8):, :] + + y_all = np.random.randint(0, 2, nsample) + y_train = y_all[:int(nsample * 0.8)] + y_test = y_all[int(nsample * 0.8):] + + datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) + print(x_train[:4, 20]) + prep = TabularPreprocessor() + prep.fit(x_train, y_train, 24*60*60, datainfo) + x_new = prep.transform(x_train) + + print("-----") + print(x_new[:4, 
2]) + From 1e7c94fb44cf455afbb999b93223c1ff75992363 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 3 Sep 2019 12:31:37 -0500 Subject: [PATCH 08/31] Fetch multiple trials from hyperopt --- autokaggle/estimators.py | 109 +++++++++++------- examples/tabular_classification_multiclass.py | 1 - 2 files changed, 65 insertions(+), 45 deletions(-) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 0576430..b5dd7d9 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -10,13 +10,49 @@ from sklearn.model_selection import StratifiedKFold, KFold from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer from joblib import dump, load from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json import hyperopt -from hyperopt import tpe, hp, fmin, space_eval +from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK + +knn_classifier_params = {'n_neighbors': hp.choice('n_neighbors', range(2, 20)), + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), + 'leaf_size': hp.choice('leaf_size', range(5, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", + "chebyshev", "minkowski" + ]), + 'p': hp.choice('p', range(1, 4)), + } +svc_params = {'C': hp.lognormal('C', 0, 1), + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 6)), + 'gamma': hp.uniform('gamma', 0.001, 10000), + 'max_iter': 50000, + } + +random_forest_classifier_params = {'criterion': hp.choice('criterion', ['entropy', 'gini']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))} + +lgbm_classifier_params = {'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), + } + +adaboost_classifier_params = {'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), + } class TabularEstimator(BaseEstimator): @@ -31,6 +67,7 @@ def __init__(self, path=None, verbose=True, time_limit=None): abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) self.best_estimator_ = None + self.ensemble_models = True def fit(self, x, y): self.init_model(y) @@ -54,25 +91,25 @@ def subsample(x, y, sample_percent): grid_train_x, grid_train_y = x[idx, :], y[idx] return grid_train_x, grid_train_y - def search(self, x, y, search_iter=40, folds=3): + def search(self, x, y, search_iter=4, folds=3): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) score_metric, skf = self.get_skf(folds) - self.hparams = space = hp.choice('classifier', [ + self.hparams = hp.choice('classifier', [ {'model': KNeighborsClassifier, - 'param': {'n_neighbors': - hp.choice('n_neighbors', range(3, 11)), - 
'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree']), - 'leaf_size': hp.choice('leaf_size', range(1, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", - "chebyshev", "minkowski" - ])} + 'param': knn_classifier_params }, {'model': SVC, - 'param': {'C': hp.lognormal('C', 0, 1), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'rbf', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 15)), - 'gamma': hp.uniform('gamma', 0.001, 10000)} + 'param': svc_params + }, + {'model': RandomForestClassifier, + 'param': random_forest_classifier_params + }, + {'model': LGBMClassifier, + 'param': lgbm_classifier_params + }, + {'model': AdaBoostClassifier, + 'param': adaboost_classifier_params } ]) @@ -81,39 +118,23 @@ def objective_func(args): loss = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() print("CV Score:", loss) print("\n=================") - return 1 - loss + return {'loss': 1 - loss, 'status': STATUS_OK, 'space': args} - opt = space_eval(self.hparams, fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, + trials = Trials() + opt = space_eval(self.hparams, fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=search_iter)) - self.best_estimator_ = opt['model'](**opt['param']) + if self.ensemble_models: + best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) + estimator_list = [] + for i in range(2): + model_params = best_trials[i]['space'] + est = model_params['model'](**model_params['param']) + estimator_list.append(est) + self.best_estimator_ = Ensembler(x, y, estimator_list) + else: + self.best_estimator_ = opt['model'](**opt['param']) return opt - - # def search(self, x, y, search_iter=40, folds=3): - # grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) - # - # if type(self.hparams) != list: - # self.hparams = [self.hparams] - # - # best_params = {} - # for idx, search_space in enumerate(self.hparams): - # best_params.update(search_space) - # if self.verbose: - # print("Step: {}".format(idx+1)) - # print("Search space:") - # print(best_params) - # score_metric, skf = self.get_skf(folds) - # random_search = RandomizedSearchCV(self.estimator, param_distributions=best_params, n_iter=search_iter, - # scoring=score_metric, - # n_jobs=1, cv=skf, verbose=0, random_state=1001, iid=False) - # random_search.fit(grid_train_x, grid_train_y) - # best_params = random_search.best_params_ - # for key, value in best_params.items(): - # best_params[key] = [value] - # - # self.best_estimator_ = random_search.best_estimator_ - # - # return random_search.best_params_ @abstractmethod def save_model(self): diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 13612f6..0da02fe 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -3,7 +3,6 @@ sys.path.append("..") # print(sys.path) from autokaggle import * - if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 nsample = 10000 From c5158ceb0868978e1c31e443d8f2bd8b9c547e96 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 3 Sep 2019 17:15:02 -0500 Subject: [PATCH 09/31] Fix regression hparamspace --- autokaggle/__init__.py | 3 +- autokaggle/auto_ml.py | 2 +- autokaggle/ensemblers.py | 57 +++++++-- autokaggle/estimators.py | 228 ++++++++++++++++----------------- examples/benchmarking.py | 6 +- examples/tabular_regression.py | 2 +- 6 files changed, 170 insertions(+), 128 deletions(-) diff --git 
a/autokaggle/__init__.py b/autokaggle/__init__.py index 3961f6d..0245d5b 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,2 +1,3 @@ from autokaggle.auto_ml import AutoKaggle -from autokaggle.estimators import * \ No newline at end of file +from autokaggle.estimators import * +from autokaggle.ensemblers import * diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index f77e004..7cfcbff 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -13,7 +13,7 @@ class AutoKaggle(BaseEstimator): - def __init__(self, estimator=LgbmClassifier(), preprocessor=TabularPreprocessor(), path=None, verbose=True): + def __init__(self, estimator=Classifier(), preprocessor=TabularPreprocessor(), path=None, verbose=True): """ Initialization function for tabular supervised learner. """ diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 532072e..e931954 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -12,26 +12,65 @@ from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error from joblib import dump, load +from scipy import stats +from lightgbm import LGBMClassifier, LGBMRegressor class RankedEnsembler: - def __init__(self, ensemble_method='max_voting'): + def __init__(self, estimator_list, ensemble_method='max_voting'): self.ensemble_method = ensemble_method + self.estimators = estimator_list - def fit(self, predictions, y_true): - pass + def fit(self, X, y): + for est in self.estimators: + est.fit(X, y) - def predict(self, predictions): + def predict(self, X): + predictions = np.zeros((len(X), len(self.estimators))) + for i, est in enumerate(self.estimators): + predictions[:, i] = est.predict(X) + if self.ensemble_method == 'median': - return predictions.apply(np.median, axis=1).values + return np.median(predictions, axis=1) elif self.ensemble_method == 'mean': - return predictions.apply(np.average, axis=1).values + return np.mean(predictions, axis=1) elif self.ensemble_method == 'max': - return predictions.apply(np.max, axis=1).values + return np.max(predictions, axis=1) elif self.ensemble_method == 'min': - return predictions.apply(np.min, axis=1).values + return np.min(predictions, axis=1) elif self.ensemble_method == 'max_voting': - return predictions.apply(mode, axis=1).values + return stats.mode(predictions, axis=1)[0] + + +class StackingEnsembler: + def __init__(self, estimator_list, objective): + self.estimator_list = estimator_list + self.objective = objective + if self.objective == 'regression': + self.stacking_estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) + elif self.objective == 'multiclass' or self.objective == 'binary': + self.stacking_estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective=self.objective) + + def fit(self, X, y): + for est in self.estimator_list: + est.fit(X, y) + predictions = np.zeros((len(X), len(self.estimator_list))) + for i, est in enumerate(self.estimator_list): + predictions[:, i] = est.predict(X) + self.stacking_estimator.fit(predictions, y) + + def predict(self, X): + predictions = np.zeros((len(X), len(self.estimator_list))) + for i, est in enumerate(self.estimator_list): + predictions[:, i] = est.predict(X) + return self.stacking_estimator.predict(predictions) + class EnsembleSelection: def __init__(self, ensemble_size=25): diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index b5dd7d9..a133002 100644 --- a/autokaggle/estimators.py 
+++ b/autokaggle/estimators.py @@ -10,49 +10,88 @@ from sklearn.model_selection import StratifiedKFold, KFold from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor,\ + ExtraTreesRegressor +from sklearn.linear_model import Ridge from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer from joblib import dump, load from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK knn_classifier_params = {'n_neighbors': hp.choice('n_neighbors', range(2, 20)), - 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), - 'leaf_size': hp.choice('leaf_size', range(5, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", - "chebyshev", "minkowski" - ]), - 'p': hp.choice('p', range(1, 4)), - } + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), + 'leaf_size': hp.choice('leaf_size', range(5, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", + "chebyshev", "minkowski" + ]), + 'p': hp.choice('p', range(1, 4)), + } + svc_params = {'C': hp.lognormal('C', 0, 1), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 6)), - 'gamma': hp.uniform('gamma', 0.001, 10000), - 'max_iter': 50000, - } + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 6)), + 'gamma': hp.uniform('gamma', 0.001, 10000), + 'max_iter': 50000, + } random_forest_classifier_params = {'criterion': hp.choice('criterion', ['entropy', 'gini']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))} + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))} lgbm_classifier_params = {'boosting_type': 'gbdt', - 'min_split_gain': 0.1, - 'subsample': 0.8, - 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), - } + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), + } adaboost_classifier_params = {'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), - } + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), + } + +extra_trees_regressor_params = { + 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 
200)), + 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), + 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), + 'min_impurity_decrease': 0.0 +} +ridge_params = { + 'fit_intercept': True, + 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), + 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) +} +random_forest_regressor_params = { + 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) +} +lgbm_regressor_params = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), +} +adaboost_regressor_params = { + 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), +} class TabularEstimator(BaseEstimator): @@ -65,19 +104,21 @@ def __init__(self, path=None, verbose=True, time_limit=None): self.time_limit = time_limit self.objective = None abs_cwd = os.path.split(os.path.abspath(__file__))[0] - self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) + # self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) self.best_estimator_ = None self.ensemble_models = True def fit(self, x, y): - self.init_model(y) + if self.objective == 'classification': + n_classes = len(set(y)) + self.objective = 'binary' if n_classes == 2 else 'multiclass' self.search(x, y) self.best_estimator_.fit(x, y) self.save_model() def predict(self, x, y=None): - y = self.best_estimator_.predict(x, ) - return y + y_pred = self.best_estimator_.predict(x, ) + return y_pred @staticmethod def subsample(x, y, sample_percent): @@ -95,24 +136,6 @@ def search(self, x, y, search_iter=4, folds=3): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) score_metric, skf = self.get_skf(folds) - self.hparams = hp.choice('classifier', [ - {'model': KNeighborsClassifier, - 'param': knn_classifier_params - }, - {'model': SVC, - 'param': svc_params - }, - {'model': RandomForestClassifier, - 'param': random_forest_classifier_params - }, - {'model': LGBMClassifier, - 'param': lgbm_classifier_params - }, - {'model': AdaBoostClassifier, - 'param': adaboost_classifier_params - } - ]) - def objective_func(args): clf = args['model'](**args['param']) loss = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() @@ -121,8 +144,7 @@ def objective_func(args): return {'loss': 1 - loss, 'status': STATUS_OK, 'space': args} trials = Trials() - opt = space_eval(self.hparams, fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, - max_evals=search_iter)) + best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=search_iter) if self.ensemble_models: best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) estimator_list = [] @@ -130,35 +152,45 @@ def objective_func(args): model_params = 
best_trials[i]['space'] est = model_params['model'](**model_params['param']) estimator_list.append(est) - self.best_estimator_ = Ensembler(x, y, estimator_list) + # self.best_estimator_ = RankedEnsembler(estimator_list, ensemble_method='max_voting') + self.best_estimator_ = StackingEnsembler(estimator_list, objective=self.objective) else: + opt = space_eval(self.hparams, best) self.best_estimator_ = opt['model'](**opt['param']) - - return opt @abstractmethod def save_model(self): pass - @abstractmethod - def init_model(self, y): - pass - @abstractmethod def get_skf(self, folds): pass - def __repr__(self): - return "TabularEstimator model" - class Classifier(TabularEstimator): """Classifier class. - It is used for tabular data classification with lightgbm classifier. + It is used for tabular data classification. """ def __init__(self, path=None, verbose=True, time_limit=None): super().__init__(path, verbose, time_limit) self.objective = 'classification' + self.hparams = hp.choice('classifier', [ + {'model': KNeighborsClassifier, + 'param': knn_classifier_params + }, + {'model': SVC, + 'param': svc_params + }, + {'model': RandomForestClassifier, + 'param': random_forest_classifier_params + }, + {'model': LGBMClassifier, + 'param': lgbm_classifier_params + }, + {'model': AdaBoostClassifier, + 'param': adaboost_classifier_params + } + ]) def get_skf(self, folds): if self.objective == 'binary': @@ -172,75 +204,41 @@ def get_skf(self, folds): class Regressor(TabularEstimator): """Regressor class. - It is used for tabular data regression with lightgbm regressor. + It is used for tabular data regression. """ def __init__(self, path=None, verbose=True, time_limit=None): super().__init__(path, verbose, time_limit) self.objective = 'regression' + self.hparams = hp.choice('regressor', [ + {'model': ExtraTreesRegressor, + 'param': extra_trees_regressor_params + }, + {'model': Ridge, + 'param': ridge_params + }, + {'model': RandomForestRegressor, + 'param': random_forest_regressor_params + }, + {'model': LGBMRegressor, + 'param': lgbm_regressor_params + }, + {'model': AdaBoostRegressor, + 'param': adaboost_regressor_params + } + ]) def get_skf(self, folds): return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) class LGBMMixIn: - _default_hyperparams = "lgbm_hp.json" - def save_model(self): self.best_estimator_.booster_.save_model(self.save_filename) - - def get_feature_importance(self): - if self.best_estimator_: - print('Feature Importance:') - print(self.best_estimator_.feature_importances_) class SklearnMixIn: - def save_model(self): dump(self.best_estimator_, self.save_filename) def load_model(self): self.best_estimator_ = load(self.save_filename) - - -class SVMClassifier(Classifier, SklearnMixIn): - _default_hyperparams = "svm_hp.json" - - def init_model(self, y): - n_classes = len(set(y)) - self.objective = 'binary' if n_classes == 2 else 'multiclass' - self.estimator = SVC() - - -class RFClassifier(Classifier, SklearnMixIn): - _default_hyperparams = "rf_hp.json" - - def init_model(self, y): - n_classes = len(set(y)) - self.objective = 'binary' if n_classes == 2 else 'multiclass' - self.estimator = RandomForestClassifier() - -class LgbmClassifier(Classifier, LGBMMixIn): - def init_model(self, y): - n_classes = len(set(y)) - if n_classes == 2: - self.objective = 'binary' - self.estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - else: - self.objective = 'multiclass' - self.estimator = LGBMClassifier(silent=False, - 
verbose=-1, - n_jobs=1, - num_class=n_classes, - objective=self.objective) - - -class LgbmRegressor(Regressor, LGBMMixIn): - def init_model(self, y): - self.estimator = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) \ No newline at end of file diff --git a/examples/benchmarking.py b/examples/benchmarking.py index c62b1dc..a5e7d69 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -1,3 +1,5 @@ +import string +import random import sys sys.path.append("../") import numpy as np @@ -15,10 +17,12 @@ import warnings from abc import abstractmethod + def generate_rand_string(size): chars = string.ascii_uppercase + string.digits return ''.join(random.choice(chars) for _ in range(size)) + class BenchmarkingBase: results = None cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", "balanced_accuracy", "F1_score", "AUC"] @@ -118,7 +122,7 @@ def evaluate(self, task_id, time_limit=10*60): if task.task_type == 'Supervised Classification': automl = AutoKaggle() if not self.estimator_type else AutoKaggle(self.estimator_type) elif task.task_type == 'Supervised Regression': - automl = AutoKaggle(LgbmRegressor) if not self.estimator_type else AutoKaggle(self.estimator_type) + automl = AutoKaggle(Regressor()) if not self.estimator_type else AutoKaggle(self.estimator_type) else: print("UNSUPPORTED TASK_TYPE") assert(0) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index ccb2064..c338dd2 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle(LgbmRegressor()) + clf = AutoKaggle(Regressor()) datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 6dd78ae12e5bd2fa19d1ac6dc5fb93324c1b2407 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 4 Sep 2019 10:46:37 -0500 Subject: [PATCH 10/31] Resolve fziling datasets --- autokaggle/estimators.py | 56 +++++++++-------- autokaggle/preprocessor.py | 125 +++++++++++++++++++++++++++++-------- examples/benchmarking.py | 102 ++++++++++++++++-------------- 3 files changed, 185 insertions(+), 98 deletions(-) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index a133002..3209710 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -21,6 +21,8 @@ import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK + +# TODO: Way to change the default hparams knn_classifier_params = {'n_neighbors': hp.choice('n_neighbors', range(2, 20)), 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), 'leaf_size': hp.choice('leaf_size', range(5, 50)), @@ -104,15 +106,18 @@ def __init__(self, path=None, verbose=True, time_limit=None): self.time_limit = time_limit self.objective = None abs_cwd = os.path.split(os.path.abspath(__file__))[0] - # self.hparams = read_json(abs_cwd + "/hparam_space/" + self._default_hyperparams) self.best_estimator_ = None - self.ensemble_models = True + self.use_ensembling = False + self.hparams = None + self.num_estimators_ensemble = 2 + self.ensemble_strategy = 'ranked_ensembling' + self.ensemble_method = 'max_voting' def fit(self, x, y): if self.objective == 'classification': n_classes = len(set(y)) self.objective = 'binary' if n_classes == 2 else 'multiclass' - self.search(x, y) + self.best_estimator_, _ = self.search(x, y) self.best_estimator_.fit(x, y) self.save_model() @@ 
-122,6 +127,7 @@ def predict(self, x, y=None): @staticmethod def subsample(x, y, sample_percent): + # TODO: Add way to balance the subsample # Set small sample for hyper-param search if x.shape[0] > 600: grid_train_percentage = max(600.0 / x.shape[0], sample_percent) @@ -132,31 +138,37 @@ def subsample(x, y, sample_percent): grid_train_x, grid_train_y = x[idx, :], y[idx] return grid_train_x, grid_train_y - def search(self, x, y, search_iter=4, folds=3): - grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=0.1) + def search(self, x, y, search_iter=4, folds=3, sample_percent=0.1): + grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=sample_percent) score_metric, skf = self.get_skf(folds) def objective_func(args): clf = args['model'](**args['param']) - loss = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() - print("CV Score:", loss) - print("\n=================") - return {'loss': 1 - loss, 'status': STATUS_OK, 'space': args} + eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + if self.verbose: + print("CV Score:", eval_score) + print("\n=================") + return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=search_iter) - if self.ensemble_models: + if self.use_ensembling: best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) estimator_list = [] - for i in range(2): + for i in range(self.num_estimators_ensemble): model_params = best_trials[i]['space'] est = model_params['model'](**model_params['param']) estimator_list.append(est) - # self.best_estimator_ = RankedEnsembler(estimator_list, ensemble_method='max_voting') - self.best_estimator_ = StackingEnsembler(estimator_list, objective=self.objective) + if self.ensemble_strategy == 'ranked_ensembling': + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.ensemble_method) + elif self.ensemble_strategy == 'stacking': + best_estimator_ = StackingEnsembler(estimator_list, objective=self.objective) + else: + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.ensemble_method) else: opt = space_eval(self.hparams, best) - self.best_estimator_ = opt['model'](**opt['param']) + best_estimator_ = opt['model'](**opt['param']) + return best_estimator_, trials @abstractmethod def save_model(self): @@ -174,6 +186,7 @@ class Classifier(TabularEstimator): def __init__(self, path=None, verbose=True, time_limit=None): super().__init__(path, verbose, time_limit) self.objective = 'classification' + # TODO: add choice to the set of estimators self.hparams = hp.choice('classifier', [ {'model': KNeighborsClassifier, 'param': knn_classifier_params @@ -209,6 +222,7 @@ class Regressor(TabularEstimator): def __init__(self, path=None, verbose=True, time_limit=None): super().__init__(path, verbose, time_limit) self.objective = 'regression' + # TODO: add choice to the set of estimators self.hparams = hp.choice('regressor', [ {'model': ExtraTreesRegressor, 'param': extra_trees_regressor_params @@ -229,16 +243,4 @@ def __init__(self, path=None, verbose=True, time_limit=None): def get_skf(self, folds): return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - - -class LGBMMixIn: - def save_model(self): - self.best_estimator_.booster_.save_model(self.save_filename) - - -class SklearnMixIn: - def save_model(self): - dump(self.best_estimator_, 
self.save_filename) - - def load_model(self): - self.best_estimator_ = load(self.save_filename) + diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index f155a73..84a9460 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -3,7 +3,7 @@ import scipy from scipy.stats import pearsonr from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer from sklearn.base import TransformerMixin from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer @@ -164,32 +164,73 @@ def __init__(self, selected_columns=[], selected_type=None): self.selected = selected_columns self.selected_type = selected_type - @abstractmethod def fit(self, X, y=None): + if not self.selected: + return self + return self._fit(X, y) + + def transform(self, X, y=None): + if not self.selected: + return X + return self._transform(X, y) + + @abstractmethod + def _fit(self, X, y=None): pass @abstractmethod - def transform(self, X, y=None): + def _transform(self, X, y=None): pass class TabScaler(Primitive): scaler = None - def fit(self, X, y=None): + def _fit(self, X, y=None): self.scaler = StandardScaler() self.scaler.fit(X[:, self.selected], y) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): X[:, self.selected] = self.scaler.transform(X[:, self.selected]) return X +class BoxCox(Primitive): + transformer = None + + def _fit(self, X, y=None): + self.transformer = PowerTransformer() + self.transformer.fit(X[:, self.selected], y) + return self + + def _transform(self, X, y=None): + X[:, self.selected] = self.transformer.transform(X[:, self.selected]) + return X + + +class Binning(Primitive): + binner = None + + def __init__(self, selected_columns=[], selected_type=None, strategy='quantile', encoding='ordinal'): + super().__init__(selected_columns, selected_type) + self.strategy = strategy + self.encoding = encoding + + def _fit(self, X, y=None): + self.binner = KBinsDiscretizer(strategy=self.strategy, encode=self.encoding) + self.binner.fit(X[:, self.selected], y) + return self + + def _transform(self, X, y=None): + X[:, self.selected] = self.binner.transform(X[:, self.selected]) + return X + + class CatEncoder(Primitive): cat_to_int_label = {} - def fit(self, X, y=None): + def _fit(self, X, y=None): for col_index in self.selected: self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) for row_index in range(len(X)): @@ -198,7 +239,27 @@ def fit(self, X, y=None): self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): + for col_index in self.selected: + for row_index in range(len(X)): + key = str(X[row_index, col_index]) + X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) + return X + + +class TargetEncoder(Primitive): + cat_to_int_label = {} + + def _fit(self, X, y=None): + for col_index in self.selected: + self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) + for row_index in range(len(X)): + key = str(X[row_index, col_index]) + if key not in self.cat_to_int_label[col_index]: + self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) + return self + + def _transform(self, X, y=None): for col_index in self.selected: for row_index in range(len(X)): key = str(X[row_index, col_index]) @@ -209,20 +270,20 @@ def transform(self, X, y=None): 
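With the split above, the empty-selection guard lives once in Primitive.fit/transform and every concrete step only implements _fit/_transform over its selected columns. A minimal sketch of a new step written against this PATCH-10 interface (the class name, percentile bounds and pipeline entry are illustrative, not part of the patch; Primitive is the base class defined in this file):

import numpy as np

class ClipOutliers(Primitive):
    # Learns per-column 1st/99th percentile bounds on the selected numeric
    # columns at fit time and clips values to that range at transform time.
    lower_ = None
    upper_ = None

    def _fit(self, X, y=None):
        self.lower_ = np.percentile(X[:, self.selected], 1, axis=0)
        self.upper_ = np.percentile(X[:, self.selected], 99, axis=0)
        return self

    def _transform(self, X, y=None):
        X[:, self.selected] = np.clip(X[:, self.selected], self.lower_, self.upper_)
        return X

Because the base class short-circuits when selected_columns is empty, such a step could be appended to the preprocessing Pipeline (e.g. ('clip', ClipOutliers(selected_columns=self.num_col))) without special-casing datasets that have no numeric columns.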
class FilterConstant(Primitive): selected_cols = [] - def fit(self, X, y=None): + def _fit(self, X, y=None): self.selected_cols = np.where(np.max(X, axis=0) - np.min(X, axis=0) != 0)[0] return self - def transform(self, X, y=None): + def _transform(self, X, y=None): return X[:, self.selected_cols] class TimeDiff(Primitive): - def fit(self, X, y=None): + def _fit(self, X, y=None): return self - def transform(self, X, y=None): + def _transform(self, X, y=None): x_time = X[:, self.selected] len_cols = x_time.shape[1] for i in range(len_cols): @@ -231,27 +292,40 @@ def transform(self, X, y=None): return X +class TimeOffset(Primitive): + start_time = None + + def _fit(self, X, y=None): + self.start_time = np.min(X[self.selected], axis=0) + return self + + def _transform(self, X, y=None): + X[self.selected] = X[self.selected] - self.start_time + return X + + class TabPCA(Primitive): pca = None - def fit(self, X, y=None): + def _fit(self, X, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') + self.pca.fit(X[:, self.selected]) return self - def transform(self, X, y=None): - x_pca = self.pca.fit_transform(X[:, self.selected]) + def _transform(self, X, y=None): + x_pca = self.pca.transform(X[:, self.selected]) return np.concatenate([X, x_pca], axis=1) class CatCount(Primitive): count_dict = {} - def fit(self, X, y=None): + def _fit(self, X, y=None): for col in self.selected: self.count_dict[col] = collections.Counter(X[:, col]) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): for col in self.selected: gen_freq = np.vectorize(lambda key: self.count_dict[col][key]) X[:, col] = gen_freq(X[:, col]) @@ -260,10 +334,10 @@ def transform(self, X, y=None): class LogTransform(Primitive): - def fit(self, X, y=None): + def _fit(self, X, y=None): return self - def transform(self, X, y=None): + def _transform(self, X, y=None): for col in self.selected: X[:, col] = np.square(np.log(X[:, col])) return X @@ -272,12 +346,13 @@ def transform(self, X, y=None): class Imputation(Primitive): imputer = None - def fit(self, X, y=None): - self.imputer = SimpleImputer(strategy='most_frequent') + def _fit(self, X, y=None): + # TODO implement most_frequent + self.imputer = SimpleImputer(strategy='constant', fill_value=0) self.imputer.fit(X) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): return self.imputer.transform(X) @@ -287,7 +362,7 @@ def __init__(self, selected_columns=[], selected_type=None, threshold=0.001): self.threshold = threshold self.drop_columns = [] - def fit(self, X, y=None): + def _fit(self, X, y=None): for col in self.selected: mu = abs(pearsonr(X[:, col], y)[0]) if np.isnan(mu): @@ -296,7 +371,7 @@ def fit(self, X, y=None): self.drop_columns.append(col) return self - def transform(self, X, y=None): + def _transform(self, X, y=None): X = np.delete(X, self.drop_columns, axis=1) return X @@ -308,7 +383,7 @@ def __init__(self, selected_columns=[], selected_type=None, threshold=0.001, tas self.drop_columns = [] self.task_type = task_type - def fit(self, X, y=None): + def _fit(self, X, y=None): if self.task_type == 'classification': n_classes = len(set(y)) if n_classes == 2: @@ -333,7 +408,7 @@ def fit(self, X, y=None): self.drop_columns = np.where(feature_importance < self.threshold)[0] return self - def transform(self, X, y=None): + def _transform(self, X, y=None): X = np.delete(X, self.drop_columns, axis=1) return X diff --git a/examples/benchmarking.py b/examples/benchmarking.py index a5e7d69..4ff1a59 100644 --- 
a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -8,8 +8,8 @@ import sklearn.datasets from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, balanced_accuracy_score,\ mean_absolute_error, mean_squared_error -from autosklearn.regression import AutoSklearnRegressor -from autosklearn.classification import AutoSklearnClassifier +# from autosklearn.regression import AutoSklearnRegressor +# from autosklearn.classification import AutoSklearnClassifier from autokaggle import * from autokaggle.utils import * import openml @@ -142,50 +142,50 @@ def evaluate(self, task_id, time_limit=10*60): print(result) return result - -class BenchmarkingAutoSklearn(BenchmarkingBase): - def get_data_info(self, categorical_indicator): - return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] - - def evaluate(self, task_id, time_limit=10*60): - task_info = ["autosklearn", task_id, time_limit] - task = openml.tasks.get_task(task_id) - train_indices, test_indices = task.get_train_test_split_indices() - dataset = task.get_dataset() - X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') - - x_train, y_train = X[train_indices], y[train_indices] - x_test, y_test = X[test_indices], y[test_indices] - - # Create feature type list from openml.org indicator - feat_type = self.get_data_info(categorical_indicator) - - # Train - if task.task_type == 'Supervised Classification': - automl = AutoSklearnClassifier( - time_left_for_this_task=time_limit, - per_run_time_limit=time_limit//10, **kwargs) - elif task.task_type == 'Supervised Regression': - automl = AutoSklearnRegressor( - time_left_for_this_task=time_limit, - per_run_time_limit=time_limit//10, **kwargs) - else: - print("UNSUPPORTED TASK_TYPE") - assert(0) - - automl.fit(x_train, y_train, feat_type=feat_type) - - y_hat = automl.predict(x_test) - if task.task_type == 'Supervised Classification': - is_binary = True if len(task.class_labels) <= 2 else False - result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) - self.cls_results.loc[len(self.cls_results)] = result - elif task.task_type == 'Supervised Regression': - result = task_info + self.measure_performance_rgs(y_test, y_hat) - self.rgs_results.loc[len(self.rgs_results)] = result - self.results.append(result) - print(result) - return result +# +# class BenchmarkingAutoSklearn(BenchmarkingBase): +# def get_data_info(self, categorical_indicator): +# return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] +# +# def evaluate(self, task_id, time_limit=10*60): +# task_info = ["autosklearn", task_id, time_limit] +# task = openml.tasks.get_task(task_id) +# train_indices, test_indices = task.get_train_test_split_indices() +# dataset = task.get_dataset() +# X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') +# +# x_train, y_train = X[train_indices], y[train_indices] +# x_test, y_test = X[test_indices], y[test_indices] +# +# # Create feature type list from openml.org indicator +# feat_type = self.get_data_info(categorical_indicator) +# +# # Train +# if task.task_type == 'Supervised Classification': +# automl = AutoSklearnClassifier( +# time_left_for_this_task=time_limit, +# per_run_time_limit=time_limit//10, **kwargs) +# elif task.task_type == 'Supervised Regression': +# automl = AutoSklearnRegressor( +# time_left_for_this_task=time_limit, +# per_run_time_limit=time_limit//10, **kwargs) +# else: +# 
print("UNSUPPORTED TASK_TYPE") +# assert(0) +# +# automl.fit(x_train, y_train, feat_type=feat_type) +# +# y_hat = automl.predict(x_test) +# if task.task_type == 'Supervised Classification': +# is_binary = True if len(task.class_labels) <= 2 else False +# result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) +# self.cls_results.loc[len(self.cls_results)] = result +# elif task.task_type == 'Supervised Regression': +# result = task_info + self.measure_performance_rgs(y_test, y_hat) +# self.rgs_results.loc[len(self.rgs_results)] = result +# self.results.append(result) +# print(result) +# return result def get_dataset_ids(task_ids): if type(task_ids) == list: @@ -208,6 +208,7 @@ def get_task_info(task_ids): dataset_list.append(dataset) return dataset_list, task_types + def get_dataset_properties(task_ids): dataset_list, task_types = get_task_info(task_ids) df = pd.DataFrame(columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", "#Date"]) @@ -274,3 +275,12 @@ def get_box_plot(data, task_id, metric): auto_kaggle.append(med_ak) temp = pd.DataFrame(data={"Autokaggle":auto_kaggle, "AutoSklearn":auto_sklearn}) temp.boxplot() + + +if __name__ == "__main__": + regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] + classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, + 9914, 3020, 3524, 3573, 3962] + ak = BenchmarkingAutoKaggle() + # ak.run_automation(classification_task_list) + ak.evaluate(24) \ No newline at end of file From d6a606087fba2a702188ec673883751e32b37d2f Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 4 Sep 2019 12:40:44 -0500 Subject: [PATCH 11/31] Before shifting to pandas --- autokaggle/estimators.py | 6 ++++++ autokaggle/preprocessor.py | 36 ++++++++++++++++++------------------ examples/benchmarking.py | 6 +++++- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 3209710..dccff50 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -20,6 +20,7 @@ from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK +from imblearn.over_sampling import SMOTE, SMOTENC # TODO: Way to change the default hparams @@ -117,6 +118,7 @@ def fit(self, x, y): if self.objective == 'classification': n_classes = len(set(y)) self.objective = 'binary' if n_classes == 2 else 'multiclass' + x, y = self.resample(x, y) self.best_estimator_, _ = self.search(x, y) self.best_estimator_.fit(x, y) self.save_model() @@ -125,6 +127,10 @@ def predict(self, x, y=None): y_pred = self.best_estimator_.predict(x, ) return y_pred + @staticmethod + def resample(X, y): + return SMOTE(sampling_strategy='auto').fit_resample(X, y) + @staticmethod def subsample(x, y, sample_percent): # TODO: Add way to balance the subsample diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 84a9460..55943a9 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -247,24 +247,24 @@ def _transform(self, X, y=None): return X -class TargetEncoder(Primitive): - cat_to_int_label = {} - - def _fit(self, X, y=None): - for col_index in self.selected: - self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) - for row_index in range(len(X)): - key = str(X[row_index, col_index]) - if key not in self.cat_to_int_label[col_index]: - self.cat_to_int_label[col_index][key] = 
len(self.cat_to_int_label[col_index]) - return self - - def _transform(self, X, y=None): - for col_index in self.selected: - for row_index in range(len(X)): - key = str(X[row_index, col_index]) - X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) - return X +# class TargetEncoder(Primitive): +# cat_to_int_label = {} +# +# def _fit(self, X, y=None): +# for col_index in self.selected: +# self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) +# for row_index in range(len(X)): +# key = str(X[row_index, col_index]) +# if key not in self.cat_to_int_label[col_index]: +# self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) +# return self +# +# def _transform(self, X, y=None): +# for col_index in self.selected: +# for row_index in range(len(X)): +# key = str(X[row_index, col_index]) +# X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) +# return X class FilterConstant(Primitive): diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 4ff1a59..30d8dec 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -283,4 +283,8 @@ def get_box_plot(data, task_id, metric): 9914, 3020, 3524, 3573, 3962] ak = BenchmarkingAutoKaggle() # ak.run_automation(classification_task_list) - ak.evaluate(24) \ No newline at end of file + import time + t1 = time.time() + ak.evaluate(45) + t2 = time.time() + print(t2-t1) \ No newline at end of file From 8d0dc6bd22a553ccdd6083a6941bbef3336016c2 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 4 Sep 2019 19:07:26 -0500 Subject: [PATCH 12/31] Shift to pandas, add 2nd order and target encoding --- autokaggle/preprocessor.py | 237 ++++++++++++++++++++++++++++--------- examples/benchmarking.py | 4 +- 2 files changed, 184 insertions(+), 57 deletions(-) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 55943a9..bead35b 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -1,6 +1,8 @@ import numpy as np -from pandas import DataFrame +import pandas as pd import scipy +import math +import itertools from scipy.stats import pearsonr from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer @@ -70,6 +72,9 @@ def fit(self, raw_x, y, time_limit, data_info): self.cat_col = list(np.where(self.data_info == 'CAT')[0]) self.num_col = list(np.where(self.data_info == 'NUM')[0]) self.time_col = list(np.where(self.data_info == 'TIME')[0]) + self.cat_col = [str(i) for i in self.cat_col] + self.num_col = [str(i) for i in self.num_col] + self.time_col = [str(i) for i in self.time_col] print('#TIME features: {}'.format(self.n_time)) print('#NUM features: {}'.format(self.n_num)) @@ -78,12 +83,21 @@ def fit(self, raw_x, y, time_limit, data_info): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x + # To pandas + raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) + self.pipeline = Pipeline([ - ('label_encoder', CatEncoder(selected_columns=self.cat_col)), + # ('cat_num_encoder', CatNumEncoder(selected_columns=self.cat_col, selected_num=self.num_col)), + ('cat_encoder', TargetEncoder(selected_columns=self.cat_col)), + # ('cat_cat_encoder', CatCatEncoder(selected_columns=self.cat_col)), ('imputer', Imputation(selected_columns=self.cat_col + self.num_col + self.time_col)), ('scaler', TabScaler(selected_columns=self.num_col)), + ('boxcox', 
BoxCox(selected_columns=self.num_col)), + ('binning', Binning(selected_columns=self.num_col)), + ('log_square', LogTransform(selected_columns=self.num_col)), ('pca', TabPCA(selected_columns=self.num_col)), ('time_diff', TimeDiff(selected_columns=self.time_col)), + ('time_offset', TimeOffset(selected_columns=self.time_col)), ('filter', FilterConstant(selected_columns=self.time_col + self.num_col + self.cat_col)), ('pearson_corr', FeatureFilter(selected_columns=self.time_col + self.num_col + self.cat_col)), ('lgbm_feat_selection', FeatureImportance(selected_columns=self.time_col + self.num_col + self.cat_col)), @@ -114,7 +128,10 @@ def transform(self, raw_x, time_limit=None): # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - return self.pipeline.transform(raw_x) + + # To pandas + raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) + return self.pipeline.transform(raw_x).values @staticmethod def extract_data_info(raw_x): @@ -165,6 +182,7 @@ def __init__(self, selected_columns=[], selected_type=None): self.selected_type = selected_type def fit(self, X, y=None): + self.selected = list(set(X.columns) & set(self.selected)) if not self.selected: return self return self._fit(X, y) @@ -188,11 +206,11 @@ class TabScaler(Primitive): def _fit(self, X, y=None): self.scaler = StandardScaler() - self.scaler.fit(X[:, self.selected], y) + self.scaler.fit(X[self.selected], y) return self def _transform(self, X, y=None): - X[:, self.selected] = self.scaler.transform(X[:, self.selected]) + X[self.selected] = self.scaler.transform(X[self.selected]) return X @@ -201,11 +219,11 @@ class BoxCox(Primitive): def _fit(self, X, y=None): self.transformer = PowerTransformer() - self.transformer.fit(X[:, self.selected], y) + self.transformer.fit(X[self.selected], y) return self def _transform(self, X, y=None): - X[:, self.selected] = self.transformer.transform(X[:, self.selected]) + X[self.selected] = self.transformer.transform(X[self.selected]) return X @@ -219,18 +237,19 @@ def __init__(self, selected_columns=[], selected_type=None, strategy='quantile', def _fit(self, X, y=None): self.binner = KBinsDiscretizer(strategy=self.strategy, encode=self.encoding) - self.binner.fit(X[:, self.selected], y) + self.binner.fit(X[self.selected], y) return self def _transform(self, X, y=None): - X[:, self.selected] = self.binner.transform(X[:, self.selected]) + X[self.selected] = self.binner.transform(X[self.selected]) return X class CatEncoder(Primitive): - cat_to_int_label = {} + cat_to_int_label = None def _fit(self, X, y=None): + self.cat_to_int_label = {} for col_index in self.selected: self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) for row_index in range(len(X)): @@ -247,35 +266,141 @@ def _transform(self, X, y=None): return X -# class TargetEncoder(Primitive): -# cat_to_int_label = {} -# -# def _fit(self, X, y=None): -# for col_index in self.selected: -# self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) -# for row_index in range(len(X)): -# key = str(X[row_index, col_index]) -# if key not in self.cat_to_int_label[col_index]: -# self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) -# return self -# -# def _transform(self, X, y=None): -# for col_index in self.selected: -# for row_index in range(len(X)): -# key = str(X[row_index, col_index]) -# X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) -# return X +class 
TargetEncoder(Primitive): + target_encoding_map = None + + @staticmethod + def calc_smooth_mean(df, by, on, alpha=5): + # Compute the global mean + mean = df[on].mean() + + # Compute the number of values and the mean of each group + agg = df.groupby(by)[on].agg(['count', 'mean']) + counts = agg['count'] + means = agg['mean'] + + # Compute the "smoothed" means + smooth = (counts * means + alpha * mean) / (counts + alpha) + return smooth + + def _fit(self, X, y=None): + self.target_encoding_map = {} + X['target'] = y + for col in self.selected: + self.target_encoding_map[col] = self.calc_smooth_mean(X, col, 'target', alpha=5) + X.drop('target', axis=1, inplace=True) + return self + + def _transform(self, X, y=None): + for col in self.selected: + X[col] = X[col].map(self.target_encoding_map[col]) + return X + + +class CatCatEncoder(Primitive): + def __init__(self, selected_columns=[], selected_type=None, strategy='count'): + super().__init__(selected_columns, selected_type) + self.strategy = strategy + self.cat_cat_map = {} + + @staticmethod + def cat_cat_count(df, col1, col2, strategy='count'): + if strategy == 'count': + mapping = df.groupby([col1])[col2].count() + elif strategy == 'nunique': + mapping = df.groupby([col1])[col2].nunique() + else: + mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[col2].nunique() + return mapping + + def _fit(self, X, y=None): + for col1, col2 in itertools.combinations(self.selected, 2): + self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(X, col1, col2, self.strategy) + return self + + def _transform(self, X, y=None): + for col1, col2 in itertools.combinations(self.selected, 2): + if col1 + '_cross_' + col2 in self.cat_cat_map: + X[col1 + '_cross_' + col2] = X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) + return X + + +class CatNumEncoder(Primitive): + def __init__(self, selected_columns=[], selected_type=None, selected_num=[], strategy='mean'): + super().__init__(selected_columns, selected_type) + self.selected_num = selected_num + self.strategy = strategy + self.cat_num_map = {} + + @staticmethod + def cat_num_interaction(df, col1, col2, method='mean'): + if method == 'mean': + mapping = df.groupby([col1])[col2].mean() + elif method == 'std': + mapping = df.groupby([col1])[col2].std() + elif method == 'max': + mapping = df.groupby([col1])[col2].max() + elif method == 'min': + mapping = df.groupby([col1])[col2].min() + else: + mapping = df.groupby([col1])[col2].mean() + + return mapping + + def _fit(self, X, y=None): + for col1 in self.selected: + for col2 in self.selected_num: + self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(X, col1, col2, self.strategy) + return self + + def _transform(self, X, y=None): + for col1 in self.selected: + for col2 in self.selected_num: + if col1 + '_cross_' + col2 in self.cat_num_map: + X[col1 + '_cross_' + col2] = X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) + return X + + +class CatBinEncoder(Primitive): + def __init__(self, selected_columns=[], selected_type=None, selected_bin=[], strategy='percent_true'): + super().__init__(selected_columns, selected_type) + self.selected_bin = selected_bin + self.strategy = strategy + self.cat_bin_map = {} + + @staticmethod + def cat_bin_interaction(df, col1, col2, strategy='percent_true'): + if strategy == 'percent_true': + mapping = df.groupby([col1])[col2].mean() + elif strategy == 'count': + mapping = df.groupby([col1])[col2].count() + else: + mapping = df.groupby([col1])[col2].mean() + return mapping + + def 
_fit(self, X, y=None): + for col1 in self.selected: + for col2 in self.selected_bin: + self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(X, col1, col2, self.strategy) + return self + + def _transform(self, X, y=None): + for col1 in self.selected: + for col2 in self.selected_bin: + if col1 + '_cross_' + col2 in self.cat_bin_map: + X[col1 + '_cross_' + col2] = X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) + return X class FilterConstant(Primitive): - selected_cols = [] + selected_cols = None def _fit(self, X, y=None): - self.selected_cols = np.where(np.max(X, axis=0) - np.min(X, axis=0) != 0)[0] + self.selected_cols = X.columns[(X.max(axis=0) - X.min(axis=0) != 0)].tolist() return self def _transform(self, X, y=None): - return X[:, self.selected_cols] + return X[self.selected_cols] class TimeDiff(Primitive): @@ -284,11 +409,8 @@ def _fit(self, X, y=None): return self def _transform(self, X, y=None): - x_time = X[:, self.selected] - len_cols = x_time.shape[1] - for i in range(len_cols): - for j in range(i + 1, len_cols): - X = np.append(X, np.expand_dims(x_time[:, i] - x_time[:, j], 1), 1) + for a, b in itertools.combinations(self.selected, 2): + X[a + '-' + b] = X[a] - X[b] return X @@ -296,7 +418,7 @@ class TimeOffset(Primitive): start_time = None def _fit(self, X, y=None): - self.start_time = np.min(X[self.selected], axis=0) + self.start_time = X[self.selected].min(axis=0) return self def _transform(self, X, y=None): @@ -309,26 +431,27 @@ class TabPCA(Primitive): def _fit(self, X, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') - self.pca.fit(X[:, self.selected]) + self.pca.fit(X[self.selected]) return self def _transform(self, X, y=None): - x_pca = self.pca.transform(X[:, self.selected]) - return np.concatenate([X, x_pca], axis=1) + x_pca = self.pca.transform(X[self.selected]) + x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in range(x_pca.shape[1])]) + return pd.concat([X, x_pca], axis=1) class CatCount(Primitive): - count_dict = {} + count_dict = None def _fit(self, X, y=None): + self.count_dict = {} for col in self.selected: - self.count_dict[col] = collections.Counter(X[:, col]) + self.count_dict[col] = collections.Counter(X[col]) return self def _transform(self, X, y=None): for col in self.selected: - gen_freq = np.vectorize(lambda key: self.count_dict[col][key]) - X[:, col] = gen_freq(X[:, col]) + X[col] = X[col].apply(lambda key: self.count_dict[col][key]) return X @@ -339,21 +462,24 @@ def _fit(self, X, y=None): def _transform(self, X, y=None): for col in self.selected: - X[:, col] = np.square(np.log(X[:, col])) + X[col] = np.square(np.log(1 + X[col])) return X class Imputation(Primitive): - imputer = None + impute_dict = None def _fit(self, X, y=None): - # TODO implement most_frequent - self.imputer = SimpleImputer(strategy='constant', fill_value=0) - self.imputer.fit(X) + self.impute_dict = {} + for col in self.selected: + value_counts = X[col].value_counts() + self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 return self def _transform(self, X, y=None): - return self.imputer.transform(X) + for col in self.selected: + X[col] = X[col].fillna(self.impute_dict[col]) + return X class FeatureFilter(Primitive): @@ -364,7 +490,7 @@ def __init__(self, selected_columns=[], selected_type=None, threshold=0.001): def _fit(self, X, y=None): for col in self.selected: - mu = abs(pearsonr(X[:, col], y)[0]) + mu = abs(pearsonr(X[col], y)[0]) if np.isnan(mu): mu = 0 if mu < self.threshold: @@ -372,7 +498,7 @@ def 
_fit(self, X, y=None): return self def _transform(self, X, y=None): - X = np.delete(X, self.drop_columns, axis=1) + X.drop(columns=self.drop_columns, inplace=True) return X @@ -397,7 +523,8 @@ def _fit(self, X, y=None): n_jobs=1, num_class=n_classes, objective='multiclass') - elif self.task_type == 'regression': + else: + # self.task_type == 'regression' estimator = LGBMRegressor(silent=False, verbose=-1, n_jobs=1, @@ -405,11 +532,11 @@ def _fit(self, X, y=None): estimator.fit(X, y) feature_importance = estimator.feature_importances_ feature_importance = feature_importance/feature_importance.mean() - self.drop_columns = np.where(feature_importance < self.threshold)[0] + self.drop_columns = X.columns[np.where(feature_importance < self.threshold)[0]] return self def _transform(self, X, y=None): - X = np.delete(X, self.drop_columns, axis=1) + X.drop(columns=self.drop_columns, inplace=True) return X diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 30d8dec..317bef4 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -285,6 +285,6 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) import time t1 = time.time() - ak.evaluate(45) + ak.evaluate(3020) t2 = time.time() - print(t2-t1) \ No newline at end of file + print(t2-t1) From e10541ddcb2fcbd0b8a9d061a3e9ba7b78d1e851 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 4 Sep 2019 19:35:37 -0500 Subject: [PATCH 13/31] Use Tabular Data --- autokaggle/preprocessor.py | 93 ++++++++++++++++---------------------- examples/benchmarking.py | 2 +- 2 files changed, 40 insertions(+), 55 deletions(-) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index bead35b..f0c655b 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -1,7 +1,6 @@ import numpy as np import pandas as pd import scipy -import math import itertools from scipy.stats import pearsonr from sklearn.decomposition import PCA @@ -17,10 +16,11 @@ class TabularPreprocessor: - def __init__(self): + def __init__(self, verbose=True): """ Initialization function for tabular preprocessor. 
""" + self.verbose = verbose self.num_cat_pair = {} self.total_samples = 0 @@ -61,30 +61,13 @@ def fit(self, raw_x, y, time_limit, data_info): self.budget = time_limit # Extract or read data info self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - print('DATA_INFO: {}'.format(self.data_info)) # Set the meta info for each data type - self.n_time = sum(self.data_info == 'TIME') - self.n_num = sum(self.data_info == 'NUM') - self.n_cat = sum(self.data_info == 'CAT') - self.total_samples = raw_x.shape[0] - - self.cat_col = list(np.where(self.data_info == 'CAT')[0]) - self.num_col = list(np.where(self.data_info == 'NUM')[0]) - self.time_col = list(np.where(self.data_info == 'TIME')[0]) - self.cat_col = [str(i) for i in self.cat_col] - self.num_col = [str(i) for i in self.num_col] - self.time_col = [str(i) for i in self.time_col] - - print('#TIME features: {}'.format(self.n_time)) - print('#NUM features: {}'.format(self.n_num)) - print('#CAT features: {}'.format(self.n_cat)) - - # Convert sparse to dense if needed - raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x + self.cat_col = [str(i) for i in np.where(self.data_info == 'CAT')[0]] + self.num_col = [str(i) for i in np.where(self.data_info == 'NUM')[0]] + self.time_col = [str(i) for i in np.where(self.data_info == 'TIME')[0]] - # To pandas - raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) + data = TabularData(raw_x, self.data_info, self.verbose) self.pipeline = Pipeline([ # ('cat_num_encoder', CatNumEncoder(selected_columns=self.cat_col, selected_num=self.num_col)), @@ -102,7 +85,7 @@ def fit(self, raw_x, y, time_limit, data_info): ('pearson_corr', FeatureFilter(selected_columns=self.time_col + self.num_col + self.cat_col)), ('lgbm_feat_selection', FeatureImportance(selected_columns=self.time_col + self.num_col + self.cat_col)), ]) - self.pipeline.fit(raw_x, y) + self.pipeline.fit(data.X, y) return self @@ -126,12 +109,8 @@ def transform(self, raw_x, time_limit=None): else: self.budget = time_limit - # Convert sparse to dense if needed - raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - - # To pandas - raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) - return self.pipeline.transform(raw_x).values + data = TabularData(raw_x, self.data_info, self.verbose) + return self.pipeline.transform(data.X).values @staticmethod def extract_data_info(raw_x): @@ -153,27 +132,33 @@ def extract_data_info(raw_x): class TabularData: - def __init__(self, data, data_info): - self.data = data + def __init__(self, raw_x, data_info, verbose=True): self.data_info = data_info + self.verbose = verbose + self.cat_col = [str(i) for i in np.where(self.data_info == 'CAT')[0]] + self.num_col = [str(i) for i in np.where(self.data_info == 'NUM')[0]] + self.time_col = [str(i) for i in np.where(self.data_info == 'TIME')[0]] + + self.n_time = len(self.time_col) + self.n_num = len(self.num_col) + self.n_cat = len(self.cat_col) + self.total_samples = raw_x.shape[0] - self.total_samples = 0 - - self.cat_to_int_label = {} - self.n_first_batch_keys = {} - self.high_level_cat_keys = [] + # Convert sparse to dense if needed + raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - self.num_cat_pair = {} - self.feature_add_high_cat = 0 - self.feature_add_cat_num = 10 - self.feature_add_cat_cat = 10 - self.order_num_cat_pair = {} + # To pandas + if type(raw_x) != pd.DataFrame: + raw_x = pd.DataFrame(raw_x, 
columns=[str(i) for i in range(raw_x.shape[1])]) - self.selected_cols = None + self.X = raw_x + self.cat_cardinality = {} - self.n_time = None - self.n_num = None - self.n_cat = None + if self.verbose: + print('DATA_INFO: {}'.format(self.data_info)) + print('#TIME features: {}'.format(self.n_time)) + print('#NUM features: {}'.format(self.n_num)) + print('#CAT features: {}'.format(self.n_cat)) class Primitive(BaseEstimator, TransformerMixin): @@ -181,23 +166,23 @@ def __init__(self, selected_columns=[], selected_type=None): self.selected = selected_columns self.selected_type = selected_type - def fit(self, X, y=None): - self.selected = list(set(X.columns) & set(self.selected)) + def fit(self, data, y=None): + self.selected = list(set(data.columns) & set(self.selected)) if not self.selected: return self - return self._fit(X, y) + return self._fit(data, y) - def transform(self, X, y=None): + def transform(self, data, y=None): if not self.selected: - return X - return self._transform(X, y) + return data + return self._transform(data, y) @abstractmethod - def _fit(self, X, y=None): + def _fit(self, data, y=None): pass @abstractmethod - def _transform(self, X, y=None): + def _transform(self, data, y=None): pass diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 317bef4..50bcfbb 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -285,6 +285,6 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) import time t1 = time.time() - ak.evaluate(3020) + ak.evaluate(3021) t2 = time.time() print(t2-t1) From 2e97035a4d037ed723c1502a337be84a3596d636 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 5 Sep 2019 10:18:34 -0500 Subject: [PATCH 14/31] Save changes --- autokaggle/ensemblers.py | 56 +++--- autokaggle/preprocessor.py | 371 ++++++++++++++++++++++--------------- 2 files changed, 257 insertions(+), 170 deletions(-) diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index e931954..1e5fe88 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -14,6 +14,8 @@ from joblib import dump, load from scipy import stats from lightgbm import LGBMClassifier, LGBMRegressor +import collections +from sklearn.model_selection import RandomizedSearchCV, cross_val_score class RankedEnsembler: @@ -73,52 +75,59 @@ def predict(self, X): class EnsembleSelection: - def __init__(self, ensemble_size=25): - self.ensemble_size = ensemble_size - - def fit(self, predictions, y_true): + indices_ = None + weights_ = None + + def __init__(self, estimator_list, objective, ensemble_size=25): + self.estimator_list = estimator_list + self.objective = objective + self.indices_, self.weights_ = [], [] + self.ensemble_size = min(len(estimator_list), ensemble_size) + if self.objective == 'regression': + self.score_metric = 'neg_mean_squared_error' + self.skf = KFold(n_splits=3, shuffle=True, random_state=1001) + else: + self.score_metric = 'neg_mean_squared_error' + self.skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1001) + + def fit(self, X, y_true): """Rich Caruana's ensemble selection method. (fast version)""" ensemble = [] trajectory = [] order = [] for i in range(self.ensemble_size): - scores = np.zeros((len(predictions))) + scores = np.zeros((len(self.estimator_list))) s = len(ensemble) if s == 0: - weighted_ensemble_prediction = np.zeros(predictions[0].shape) + weighted_ensemble_prediction = np.zeros(len(self.estimator_list)) else: # Memory-efficient averaging! 
ensemble_prediction = np.zeros(ensemble[0].shape) - for pred in ensemble: - ensemble_prediction += pred + for est in ensemble: + ensemble_prediction += est ensemble_prediction /= s - weighted_ensemble_prediction = (s / float(s + 1)) * \ - ensemble_prediction + weighted_ensemble_prediction = (s / float(s + 1)) * ensemble_prediction fant_ensemble_prediction = np.zeros(weighted_ensemble_prediction.shape) - for j, pred in enumerate(predictions): + for j, est in enumerate(self.estimator_list): fant_ensemble_prediction[:,:] = weighted_ensemble_prediction + \ - (1. / float(s + 1)) * pred - scores[j] = calculate_score( - solution=labels, - prediction=fant_ensemble_prediction, - task_type=self.task_type, - metric=self.metric, - all_scoring_functions=False) + (1. / float(s + 1)) * est + scores[j] = cross_val_score(self.estimator_list[j], X, y_true, scoring=self.score_metric, + cv=self.skf).mean() all_best = np.argwhere(scores == np.nanmin(scores)).flatten() best = np.random.RandomState.choice(all_best) - ensemble.append(predictions[best]) + ensemble.append(self.estimator_list[best]) trajectory.append(scores[best]) order.append(best) # Handle special case - if len(predictions) == 1: + if len(self.estimator_list) == 1: break self.indices_ = order - ensemble_members = Counter(self.indices_).most_common() + ensemble_members = collections.Counter(self.indices_) weights = np.zeros((self.ensemble_size,), dtype=float) for ensemble_member in ensemble_members: weight = float(ensemble_member[1]) / self.ensemble_size @@ -128,7 +137,6 @@ def fit(self, predictions, y_true): weights = weights / np.sum(weights) self.weights_ = weights - self._calculate_weights() - def predict(self, predictions): - return np.average(predictions, axis=1, weights=self.weights_) + def predict(self, X): + return np.average(X, axis=1, weights=self.weights_) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index f0c655b..d2ac6d8 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -21,11 +21,8 @@ def __init__(self, verbose=True): Initialization function for tabular preprocessor. 
""" self.verbose = verbose - self.num_cat_pair = {} self.total_samples = 0 - - self.cat_to_int_label = {} self.n_first_batch_keys = {} self.high_level_cat_keys = [] @@ -34,15 +31,8 @@ def __init__(self, verbose=True): self.feature_add_cat_cat = 10 self.order_num_cat_pair = {} - self.selected_cols = None self.budget = None self.data_info = None - self.n_time = None - self.n_num = None - self.n_cat = None - self.cat_col = None - self.num_col = None - self.time_col = None self.pipeline = None def fit(self, raw_x, y, time_limit, data_info): @@ -62,30 +52,27 @@ def fit(self, raw_x, y, time_limit, data_info): # Extract or read data info self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - # Set the meta info for each data type - self.cat_col = [str(i) for i in np.where(self.data_info == 'CAT')[0]] - self.num_col = [str(i) for i in np.where(self.data_info == 'NUM')[0]] - self.time_col = [str(i) for i in np.where(self.data_info == 'TIME')[0]] - data = TabularData(raw_x, self.data_info, self.verbose) self.pipeline = Pipeline([ - # ('cat_num_encoder', CatNumEncoder(selected_columns=self.cat_col, selected_num=self.num_col)), - ('cat_encoder', TargetEncoder(selected_columns=self.cat_col)), - # ('cat_cat_encoder', CatCatEncoder(selected_columns=self.cat_col)), - ('imputer', Imputation(selected_columns=self.cat_col + self.num_col + self.time_col)), - ('scaler', TabScaler(selected_columns=self.num_col)), - ('boxcox', BoxCox(selected_columns=self.num_col)), - ('binning', Binning(selected_columns=self.num_col)), - ('log_square', LogTransform(selected_columns=self.num_col)), - ('pca', TabPCA(selected_columns=self.num_col)), - ('time_diff', TimeDiff(selected_columns=self.time_col)), - ('time_offset', TimeOffset(selected_columns=self.time_col)), - ('filter', FilterConstant(selected_columns=self.time_col + self.num_col + self.cat_col)), - ('pearson_corr', FeatureFilter(selected_columns=self.time_col + self.num_col + self.cat_col)), - ('lgbm_feat_selection', FeatureImportance(selected_columns=self.time_col + self.num_col + self.cat_col)), + ('imputer', Imputation(selected_type='ALL', operation='upd')), + ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), + ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), + ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), + ('count_encoder', CatCount(selected_type='CAT', operation='add')), + ('label_encoder', CatEncoder(selected_type='CAT', operation='add')), + ('boxcox', BoxCox(selected_type='NUM', operation='upd')), + ('log_square', LogTransform(selected_type='NUM', operation='upd')), + ('scaler', TabScaler(selected_type='NUM', operation='upd')), + ('binning', Binning(selected_type='NUM', operation='add')), + ('pca', TabPCA(selected_type='NUM', operation='add')), + ('time_diff', TimeDiff(selected_type='TIME', operation='upd')), + ('time_offset', TimeOffset(selected_type='TIME', operation='upd')), + ('filter', FilterConstant(selected_type='ALL', operation='del')), + ('pearson_corr', FeatureFilter(selected_type='ALL', operation='del')), + ('lgbm_feat_selection', FeatureImportance(selected_type='ALL', operation='del')), ]) - self.pipeline.fit(data.X, y) + self.pipeline.fit(data, y) return self @@ -110,7 +97,7 @@ def transform(self, raw_x, time_limit=None): self.budget = time_limit data = TabularData(raw_x, self.data_info, self.verbose) - return self.pipeline.transform(data.X).values + return self.pipeline.transform(data).X.values @staticmethod def 
extract_data_info(raw_x): @@ -132,27 +119,31 @@ def extract_data_info(raw_x): class TabularData: + cat_col = None + num_col = None + time_col = None + n_cat, n_time, n_num = 0, 0, 0 + cat_cardinality = None + generated_features = None + feature_options = None + num_info = None + def __init__(self, raw_x, data_info, verbose=True): - self.data_info = data_info self.verbose = verbose - self.cat_col = [str(i) for i in np.where(self.data_info == 'CAT')[0]] - self.num_col = [str(i) for i in np.where(self.data_info == 'NUM')[0]] - self.time_col = [str(i) for i in np.where(self.data_info == 'TIME')[0]] - - self.n_time = len(self.time_col) - self.n_num = len(self.num_col) - self.n_cat = len(self.cat_col) + self.data_info = {str(i): data_info[i] for i in range(len(data_info))} self.total_samples = raw_x.shape[0] + self.refresh_col_types() # Convert sparse to dense if needed raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x - # To pandas + # To pandas Dataframe if type(raw_x) != pd.DataFrame: raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) self.X = raw_x self.cat_cardinality = {} + self.update_cat_cardinality() if self.verbose: print('DATA_INFO: {}'.format(self.data_info)) @@ -160,14 +151,70 @@ def __init__(self, raw_x, data_info, verbose=True): print('#NUM features: {}'.format(self.n_num)) print('#CAT features: {}'.format(self.n_cat)) + def update_type(self, columns, new_type): + if not new_type: + return + for c in columns: + self.data_info[c] = new_type + + def delete_type(self, columns): + for c in columns: + _ = self.data_info.pop(c, 0) + + def update(self, operation, columns, x_tr, new_type=None): + if operation == 'upd': + if x_tr is not None: + self.X[columns] = x_tr + self.update_type(columns, new_type) + elif operation == 'add': + if x_tr is not None: + self.X = pd.concat([self.X, x_tr], axis=1) + self.update_type(x_tr.columns, new_type) + elif operation == 'del': + if len(columns) != 0: + self.X.drop(columns, inplace=True) + self.delete_type(columns) + else: + print("invalid operation") + self.refresh_col_types() + + def refresh_col_types(self): + self.cat_col = [k for k, v in self.data_info.items() if v == 'CAT'] + self.num_col = [k for k, v in self.data_info.items() if v == 'NUM'] + self.time_col = [k for k, v in self.data_info.items() if v == 'TIME'] + self.n_time = len(self.time_col) + self.n_num = len(self.num_col) + self.n_cat = len(self.cat_col) + + def update_cat_cardinality(self): + for c in self.cat_col: + self.cat_cardinality[c] = len(set(self.X[c])) + + def select_columns(self, data_type): + self.refresh_col_types() + if data_type == 'CAT': + return self.cat_col + elif data_type == 'TIME': + return self.time_col + elif data_type == 'NUM': + return self.num_col + elif data_type == 'ALL': + return list(self.data_info.keys()) + else: + print('invalid Type') + return [] + class Primitive(BaseEstimator, TransformerMixin): - def __init__(self, selected_columns=[], selected_type=None): - self.selected = selected_columns + selected = None + drop_columns = None + + def __init__(self, selected_type=None, operation='upd', **kwargs): self.selected_type = selected_type + self.operation = operation def fit(self, data, y=None): - self.selected = list(set(data.columns) & set(self.selected)) + self.selected = data.select_columns(self.selected_type) if not self.selected: return self return self._fit(data, y) @@ -186,54 +233,67 @@ def _transform(self, data, y=None): pass +class PrimitiveHigherOrder(Primitive): + def __init__(self, 
operation='upd', selected_type1=None, selected_type2=None, **kwargs): + self.selected_type1 = selected_type1 + self.selected_type2 = selected_type2 + self.operation = operation + self.options = kwargs + + class TabScaler(Primitive): scaler = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.scaler = StandardScaler() - self.scaler.fit(X[self.selected], y) + self.scaler.fit(data.X[self.selected], y) return self - def _transform(self, X, y=None): - X[self.selected] = self.scaler.transform(X[self.selected]) - return X + def _transform(self, data, y=None): + x_tr = self.scaler.transform(data.X[self.selected]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class BoxCox(Primitive): transformer = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.transformer = PowerTransformer() - self.transformer.fit(X[self.selected], y) + self.transformer.fit(data.X[self.selected], y) return self - def _transform(self, X, y=None): - X[self.selected] = self.transformer.transform(X[self.selected]) - return X + def _transform(self, data, y=None): + x_tr = self.transformer.transform(data.X[self.selected]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class Binning(Primitive): binner = None - def __init__(self, selected_columns=[], selected_type=None, strategy='quantile', encoding='ordinal'): - super().__init__(selected_columns, selected_type) + def __init__(self, selected_type=None, operation='upd', strategy='quantile', encoding='ordinal'): + super().__init__(selected_type, operation) self.strategy = strategy self.encoding = encoding - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.binner = KBinsDiscretizer(strategy=self.strategy, encode=self.encoding) - self.binner.fit(X[self.selected], y) + self.binner.fit(data.X[self.selected], y) return self - def _transform(self, X, y=None): - X[self.selected] = self.binner.transform(X[self.selected]) - return X + def _transform(self, data, y=None): + x_tr = self.binner.transform(data.X[self.selected]) + # TODO: decide if cat or num new type + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class CatEncoder(Primitive): cat_to_int_label = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): + X = data.X self.cat_to_int_label = {} for col_index in self.selected: self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) @@ -243,12 +303,13 @@ def _fit(self, X, y=None): self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + X = data.X for col_index in self.selected: for row_index in range(len(X)): key = str(X[row_index, col_index]) X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) - return X + return data class TargetEncoder(Primitive): @@ -268,7 +329,8 @@ def calc_smooth_mean(df, by, on, alpha=5): smooth = (counts * means + alpha * mean) / (counts + alpha) return smooth - def _fit(self, X, y=None): + def _fit(self, data, y=None): + X = data.X self.target_encoding_map = {} X['target'] = y for col in self.selected: @@ -276,18 +338,15 @@ def _fit(self, X, y=None): X.drop('target', axis=1, inplace=True) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col in self.selected: - X[col] = X[col].map(self.target_encoding_map[col]) - return X - + x_tr[col] = data.X[col].map(self.target_encoding_map[col]) + 
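# Standalone sketch (toy data, column names invented): the smoothed per-category target
# mean computed by calc_smooth_mean above, written out with plain pandas.
import pandas as pd

toy = pd.DataFrame({'city': ['a', 'a', 'b', 'b', 'b', 'c'],
                    'target': [1, 0, 1, 1, 0, 1]})
alpha = 5                                   # smoothing strength
global_mean = toy['target'].mean()
agg = toy.groupby('city')['target'].agg(['count', 'mean'])
# Rare categories are pulled toward the global mean; frequent ones keep their own mean.
smooth = (agg['count'] * agg['mean'] + alpha * global_mean) / (agg['count'] + alpha)
toy['city_encoded'] = toy['city'].map(smooth)
print(toy)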
data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data -class CatCatEncoder(Primitive): - def __init__(self, selected_columns=[], selected_type=None, strategy='count'): - super().__init__(selected_columns, selected_type) - self.strategy = strategy - self.cat_cat_map = {} +class CatCatEncoder(PrimitiveHigherOrder): @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): if strategy == 'count': @@ -298,21 +357,26 @@ def cat_cat_count(df, col1, col2, strategy='count'): mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[col2].nunique() return mapping - def _fit(self, X, y=None): + def _fit(self, data, y=None): + self.cat_cat_map = {} + self.strategy = self.options.get('strategy', 'count') for col1, col2 in itertools.combinations(self.selected, 2): - self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(X, col1, col2, self.strategy) + self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(data.X, col1, col2, self.strategy) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col1, col2 in itertools.combinations(self.selected, 2): if col1 + '_cross_' + col2 in self.cat_cat_map: - X[col1 + '_cross_' + col2] = X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) - return X + x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) + # TODO: decide new_type + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data -class CatNumEncoder(Primitive): - def __init__(self, selected_columns=[], selected_type=None, selected_num=[], strategy='mean'): - super().__init__(selected_columns, selected_type) +class CatNumEncoder(PrimitiveHigherOrder): + def __init__(self, selected_type=None, selected_num=[], operation='add', strategy='mean'): + super().__init__(selected_type, operation) self.selected_num = selected_num self.strategy = strategy self.cat_num_map = {} @@ -332,27 +396,25 @@ def cat_num_interaction(df, col1, col2, method='mean'): return mapping - def _fit(self, X, y=None): + def _fit(self, data, y=None): + self.cat_num_map = {} + self.strategy = self.options.get('strategy', 'mean') for col1 in self.selected: for col2 in self.selected_num: - self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(X, col1, col2, self.strategy) + self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(data.X, col1, col2, self.strategy) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col1 in self.selected: for col2 in self.selected_num: if col1 + '_cross_' + col2 in self.cat_num_map: - X[col1 + '_cross_' + col2] = X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) - return X - + x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data -class CatBinEncoder(Primitive): - def __init__(self, selected_columns=[], selected_type=None, selected_bin=[], strategy='percent_true'): - super().__init__(selected_columns, selected_type) - self.selected_bin = selected_bin - self.strategy = strategy - self.cat_bin_map = {} +class CatBinEncoder(PrimitiveHigherOrder): @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): if strategy == 'percent_true': @@ -363,138 +425,155 @@ def cat_bin_interaction(df, col1, col2, strategy='percent_true'): mapping = df.groupby([col1])[col2].mean() return mapping - def _fit(self, X, y=None): + 
def _fit(self, data, y=None): + self.cat_bin_map = {} + self.strategy = self.options.get('strategy', 'percent_true') for col1 in self.selected: for col2 in self.selected_bin: - self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(X, col1, col2, self.strategy) + self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(data.X, col1, col2, self.strategy) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col1 in self.selected: for col2 in self.selected_bin: if col1 + '_cross_' + col2 in self.cat_bin_map: - X[col1 + '_cross_' + col2] = X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) - return X + x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class FilterConstant(Primitive): - selected_cols = None + drop_columns = None - def _fit(self, X, y=None): - self.selected_cols = X.columns[(X.max(axis=0) - X.min(axis=0) != 0)].tolist() + def _fit(self, data, y=None): + X = data.X + self.drop_columns = X.columns[(X.max(axis=0) - X.min(axis=0) == 0)].tolist() return self - def _transform(self, X, y=None): - return X[self.selected_cols] + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None) + return data class TimeDiff(Primitive): - def _fit(self, X, y=None): + def _fit(self, data, y=None): return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for a, b in itertools.combinations(self.selected, 2): - X[a + '-' + b] = X[a] - X[b] - return X + x_tr[a + '-' + b] = data.X[a] - data.X[b] + data.update(self.operation, self.selected, x_tr, new_type='TIME') + return data class TimeOffset(Primitive): start_time = None - def _fit(self, X, y=None): - self.start_time = X[self.selected].min(axis=0) + def _fit(self, data, y=None): + self.start_time = data.X[self.selected].min(axis=0) return self - def _transform(self, X, y=None): - X[self.selected] = X[self.selected] - self.start_time - return X + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + x_tr[self.selected] = data.X[self.selected] - self.start_time + data.update(self.operation, self.selected, x_tr, new_type='TIME') + return data class TabPCA(Primitive): pca = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') - self.pca.fit(X[self.selected]) + self.pca.fit(data.X[self.selected]) return self - def _transform(self, X, y=None): - x_pca = self.pca.transform(X[self.selected]) + def _transform(self, data, y=None): + x_pca = self.pca.transform(data.X[self.selected]) x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in range(x_pca.shape[1])]) - return pd.concat([X, x_pca], axis=1) + data.update(self.operation, self.selected, x_pca, new_type='NUM') + return data class CatCount(Primitive): count_dict = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.count_dict = {} for col in self.selected: - self.count_dict[col] = collections.Counter(X[col]) + self.count_dict[col] = collections.Counter(data.X[col]) return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col in self.selected: - X[col] = X[col].apply(lambda key: self.count_dict[col][key]) - return X + x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col][key]) + data.update(self.operation, self.selected, x_tr, new_type='NUM') 
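# Standalone sketch (toy data, values invented): the frequency encoding applied by
# CatCount above; each category is replaced by how often it was seen during fit.
import collections
import pandas as pd

train_col = pd.Series(['red', 'red', 'blue', 'green', 'red'])
test_col = pd.Series(['blue', 'yellow'])            # 'yellow' was never seen in training
counts = collections.Counter(train_col)
print(test_col.apply(lambda key: counts.get(key, 0)).tolist())   # -> [1, 0]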
+ return data class LogTransform(Primitive): + name_key = 'log_' - def _fit(self, X, y=None): + def _fit(self, data, y=None): return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col in self.selected: - X[col] = np.square(np.log(1 + X[col])) - return X + x_tr[self.name_key + col] = np.square(np.log(1 + data.X[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM') + return data class Imputation(Primitive): impute_dict = None - def _fit(self, X, y=None): + def _fit(self, data, y=None): self.impute_dict = {} for col in self.selected: - value_counts = X[col].value_counts() + value_counts = data.X[col].value_counts() self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 return self - def _transform(self, X, y=None): + def _transform(self, data, y=None): for col in self.selected: - X[col] = X[col].fillna(self.impute_dict[col]) - return X + data.X[col].fillna(self.impute_dict[col]) + data.update(self.operation, self.selected, None, new_type='NUM') + return data class FeatureFilter(Primitive): - def __init__(self, selected_columns=[], selected_type=None, threshold=0.001): - super().__init__(selected_columns, selected_type) + def __init__(self, selected_type=None, operation='del', threshold=0.001): + super().__init__(selected_type, operation) self.threshold = threshold self.drop_columns = [] - def _fit(self, X, y=None): + def _fit(self, data, y=None): for col in self.selected: - mu = abs(pearsonr(X[col], y)[0]) + mu = abs(pearsonr(data.X[col], y)[0]) if np.isnan(mu): mu = 0 if mu < self.threshold: self.drop_columns.append(col) return self - def _transform(self, X, y=None): - X.drop(columns=self.drop_columns, inplace=True) - return X + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None) + return data class FeatureImportance(Primitive): - def __init__(self, selected_columns=[], selected_type=None, threshold=0.001, task_type='classification'): - super().__init__(selected_columns, selected_type) + def __init__(self, selected_type=None, operation='del', threshold=0.001, task_type='classification'): + super().__init__(selected_type, operation) self.threshold = threshold self.drop_columns = [] self.task_type = task_type - def _fit(self, X, y=None): + def _fit(self, data, y=None): if self.task_type == 'classification': n_classes = len(set(y)) if n_classes == 2: @@ -514,15 +593,15 @@ def _fit(self, X, y=None): verbose=-1, n_jobs=1, objective='regression') - estimator.fit(X, y) + estimator.fit(data.X, y) feature_importance = estimator.feature_importances_ feature_importance = feature_importance/feature_importance.mean() - self.drop_columns = X.columns[np.where(feature_importance < self.threshold)[0]] + self.drop_columns = data.X.columns[np.where(feature_importance < self.threshold)[0]] return self - def _transform(self, X, y=None): - X.drop(columns=self.drop_columns, inplace=True) - return X + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None) + return data if __name__ == "__main__": From 4c7c0fb3bf977f31cba8985cada332f72e0586cc Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Sun, 15 Sep 2019 15:48:09 -0500 Subject: [PATCH 15/31] Fix the prep pipeline --- autokaggle/auto_ml.py | 7 ++- autokaggle/estimators.py | 113 ++++++++++++++++++++--------------- autokaggle/preprocessor.py | 119 ++++++++++++++++++++++++------------- examples/benchmarking.py | 2 +- 4 files changed, 150 insertions(+), 91 
deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 7cfcbff..97e0ea3 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -67,13 +67,16 @@ def fit(self, x, y, time_limit=None, data_info=None): print("The whole available data is: ") print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) - def predict(self, x_test): + def predict(self, x_test, predict_proba=False): """ This function should provide predictions of labels on (test) data. The function predict eventually can return probabilities or continuous values. """ x_test = self.preprocessor.transform(x_test) - y = self.model.predict(x_test, ) + if predict_proba: + y = self.model.predict_proba(x_test, ) + else: + y = self.model.predict(x_test, ) if y is None: raise ValueError("Tabular predictor does not exist") return y diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index dccff50..6d34553 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -24,42 +24,46 @@ # TODO: Way to change the default hparams -knn_classifier_params = {'n_neighbors': hp.choice('n_neighbors', range(2, 20)), - 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), - 'leaf_size': hp.choice('leaf_size', range(5, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", - "chebyshev", "minkowski" - ]), - 'p': hp.choice('p', range(1, 4)), - } - -svc_params = {'C': hp.lognormal('C', 0, 1), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 6)), - 'gamma': hp.uniform('gamma', 0.001, 10000), - 'max_iter': 50000, - } - -random_forest_classifier_params = {'criterion': hp.choice('criterion', ['entropy', 'gini']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10))} - -lgbm_classifier_params = {'boosting_type': 'gbdt', - 'min_split_gain': 0.1, - 'subsample': 0.8, - 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), - } - -adaboost_classifier_params = {'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), - } +knn_classifier_params = { + 'n_neighbors': hp.choice('n_neighbors', range(2, 20)), + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), + 'leaf_size': hp.choice('leaf_size', range(5, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", "chebyshev", "minkowski"]), + 'p': hp.choice('p', range(1, 4)), +} + +svc_params = { + 'C': hp.lognormal('C', 0, 1), + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 6)), + 'gamma': hp.uniform('gamma', 0.001, 10000), + 'max_iter': 50000, +} + +random_forest_classifier_params = { + 'criterion': hp.choice('criterion', ['entropy', 'gini']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) +} + +lgbm_classifier_params = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 
80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), +} + +adaboost_classifier_params = { + 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), +} extra_trees_regressor_params = { 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 200)), @@ -68,17 +72,20 @@ 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), 'min_impurity_decrease': 0.0 } + ridge_params = { 'fit_intercept': True, 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) } + random_forest_regressor_params = { 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), 'max_features': hp.uniform('max_features', 0, 1.0), 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) } + lgbm_regressor_params = { 'boosting_type': 'gbdt', 'min_split_gain': 0.1, @@ -90,6 +97,7 @@ 'n_estimators': hp.choice('n_estimators', range(50, 200)), 'learning_rate': hp.lognormal('learning_rate', 0, 1), } + adaboost_regressor_params = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), @@ -98,7 +106,8 @@ class TabularEstimator(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None): + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=25, + ensemble_strategy='ranked_ensembling', ensemble_method='max_voting'): """ Initialization function for tabular supervised learner. 
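# Standalone sketch (random objective, stand-in space): how hyperopt consumes a search
# space built from hp.choice / hp.uniform like the parameter dictionaries above.
import hyperopt
import numpy as np
from hyperopt import fmin, hp, space_eval, Trials, STATUS_OK

space = hp.choice('classifier', [
    {'model': 'knn', 'param': {'n_neighbors': hp.choice('n_neighbors', range(2, 20))}},
    {'model': 'rf', 'param': {'n_estimators': hp.choice('rf_n_estimators', range(50, 200))}},
])

def objective_func(args):
    # A real objective would cross-validate args['model'](**args['param']); the score is
    # random here only so the snippet stays self-contained.
    return {'loss': np.random.rand(), 'status': STATUS_OK, 'space': args}

trials = Trials()
best = fmin(objective_func, space, algo=hyperopt.rand.suggest, trials=trials, max_evals=10)
print(space_eval(space, best))    # the best sampled {'model': ..., 'param': ...}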
""" @@ -108,17 +117,17 @@ def __init__(self, path=None, verbose=True, time_limit=None): self.objective = None abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.best_estimator_ = None - self.use_ensembling = False + self.use_ensembling = use_ensembling self.hparams = None - self.num_estimators_ensemble = 2 - self.ensemble_strategy = 'ranked_ensembling' - self.ensemble_method = 'max_voting' + self.num_estimators_ensemble = num_estimators_ensemble + self.ensemble_strategy = ensemble_strategy + self.ensemble_method = ensemble_method def fit(self, x, y): if self.objective == 'classification': n_classes = len(set(y)) self.objective = 'binary' if n_classes == 2 else 'multiclass' - x, y = self.resample(x, y) + # x, y = self.resample(x, y) self.best_estimator_, _ = self.search(x, y) self.best_estimator_.fit(x, y) self.save_model() @@ -126,6 +135,13 @@ def fit(self, x, y): def predict(self, x, y=None): y_pred = self.best_estimator_.predict(x, ) return y_pred + + def predict_proba(self, x, y=None): + try: + y_pred = self.best_estimator_.predict_proba(x, ) + except: + y_pred = self.best_estimator_.predict(x, ) + return y_pred @staticmethod def resample(X, y): @@ -144,13 +160,16 @@ def subsample(x, y, sample_percent): grid_train_x, grid_train_y = x[idx, :], y[idx] return grid_train_x, grid_train_y - def search(self, x, y, search_iter=4, folds=3, sample_percent=0.1): + def search(self, x, y, search_iter=100, folds=3, sample_percent=0.1): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=sample_percent) score_metric, skf = self.get_skf(folds) def objective_func(args): clf = args['model'](**args['param']) - eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + try: + eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + except ValueError: + eval_score = 0 if self.verbose: print("CV Score:", eval_score) print("\n=================") @@ -206,9 +225,9 @@ def __init__(self, path=None, verbose=True, time_limit=None): {'model': LGBMClassifier, 'param': lgbm_classifier_params }, - {'model': AdaBoostClassifier, - 'param': adaboost_classifier_params - } + # {'model': AdaBoostClassifier, + # 'param': adaboost_classifier_params + # } ]) def get_skf(self, folds): diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index d2ac6d8..c42f7f6 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -22,14 +22,9 @@ def __init__(self, verbose=True): """ self.verbose = verbose - self.total_samples = 0 - self.n_first_batch_keys = {} - self.high_level_cat_keys = [] - self.feature_add_high_cat = 0 self.feature_add_cat_num = 10 self.feature_add_cat_cat = 10 - self.order_num_cat_pair = {} self.budget = None self.data_info = None @@ -56,21 +51,21 @@ def fit(self, raw_x, y, time_limit, data_info): self.pipeline = Pipeline([ ('imputer', Imputation(selected_type='ALL', operation='upd')), - ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), - ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), - ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), - ('count_encoder', CatCount(selected_type='CAT', operation='add')), - ('label_encoder', CatEncoder(selected_type='CAT', operation='add')), - ('boxcox', BoxCox(selected_type='NUM', operation='upd')), - ('log_square', LogTransform(selected_type='NUM', operation='upd')), + # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), + # 
('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), + # ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), + ('count_encoder', CatCount(selected_type='CAT', operation='upd')), + # ('label_encoder', CatEncoder(selected_type='CAT', operation='add')), + # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), + # ('log_square', LogTransform(selected_type='NUM', operation='upd')), ('scaler', TabScaler(selected_type='NUM', operation='upd')), - ('binning', Binning(selected_type='NUM', operation='add')), - ('pca', TabPCA(selected_type='NUM', operation='add')), - ('time_diff', TimeDiff(selected_type='TIME', operation='upd')), - ('time_offset', TimeOffset(selected_type='TIME', operation='upd')), + # ('binning', Binning(selected_type='NUM', operation='upd')), + # ('pca', TabPCA(selected_type='NUM', operation='add')), + # ('time_diff', TimeDiff(selected_type='TIME', operation='add')), + # ('time_offset', TimeOffset(selected_type='TIME', operation='upd')), ('filter', FilterConstant(selected_type='ALL', operation='del')), - ('pearson_corr', FeatureFilter(selected_type='ALL', operation='del')), - ('lgbm_feat_selection', FeatureImportance(selected_type='ALL', operation='del')), + # ('pearson_corr', FeatureFilter(selected_type='ALL', operation='del')), + # ('lgbm_feat_selection', FeatureImportance(selected_type='ALL', operation='del')), ]) self.pipeline.fit(data, y) @@ -97,7 +92,9 @@ def transform(self, raw_x, time_limit=None): self.budget = time_limit data = TabularData(raw_x, self.data_info, self.verbose) - return self.pipeline.transform(data).X.values + a = self.pipeline.transform(data).X + print(a.head()) + return a.values @staticmethod def extract_data_info(raw_x): @@ -142,7 +139,6 @@ def __init__(self, raw_x, data_info, verbose=True): raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) self.X = raw_x - self.cat_cardinality = {} self.update_cat_cardinality() if self.verbose: @@ -152,8 +148,6 @@ def __init__(self, raw_x, data_info, verbose=True): print('#CAT features: {}'.format(self.n_cat)) def update_type(self, columns, new_type): - if not new_type: - return for c in columns: self.data_info[c] = new_type @@ -161,18 +155,28 @@ def delete_type(self, columns): for c in columns: _ = self.data_info.pop(c, 0) - def update(self, operation, columns, x_tr, new_type=None): + def rename_cols(self, key): + def rename_fn(col_name): + col_name += '_' + key + while col_name in self.X.columns: + col_name += '_' + key + return col_name + return rename_fn + + def update(self, operation, columns, x_tr, new_type=None, key=''): if operation == 'upd': if x_tr is not None: self.X[columns] = x_tr - self.update_type(columns, new_type) + if new_type is not None: + self.update_type(columns, new_type) elif operation == 'add': if x_tr is not None: + x_tr = x_tr.rename(columns=self.rename_cols(key)) self.X = pd.concat([self.X, x_tr], axis=1) self.update_type(x_tr.columns, new_type) elif operation == 'del': if len(columns) != 0: - self.X.drop(columns, inplace=True) + self.X.drop(columns=columns, inplace=True) self.delete_type(columns) else: print("invalid operation") @@ -187,6 +191,8 @@ def refresh_col_types(self): self.n_cat = len(self.cat_col) def update_cat_cardinality(self): + if not self.cat_cardinality: + self.cat_cardinality = {} for c in self.cat_col: self.cat_cardinality[c] = len(set(self.X[c])) @@ -208,13 +214,22 @@ def select_columns(self, data_type): class Primitive(BaseEstimator, TransformerMixin): selected = None drop_columns = None + 
options = None + supported_ops = ('add', 'upd', 'del') def __init__(self, selected_type=None, operation='upd', **kwargs): self.selected_type = selected_type self.operation = operation + self.init_vars(**kwargs) + + def init_vars(self, **kwargs): + self.options = kwargs def fit(self, data, y=None): self.selected = data.select_columns(self.selected_type) + if self.operation not in self.supported_ops: + print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) + self.selected = None if not self.selected: return self return self._fit(data, y) @@ -234,15 +249,14 @@ def _transform(self, data, y=None): class PrimitiveHigherOrder(Primitive): - def __init__(self, operation='upd', selected_type1=None, selected_type2=None, **kwargs): - self.selected_type1 = selected_type1 + def __init__(self, operation='upd', selected_type=None, selected_type2=None, **kwargs): + super().__init__(selected_type, operation, **kwargs) self.selected_type2 = selected_type2 - self.operation = operation - self.options = kwargs class TabScaler(Primitive): scaler = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.scaler = StandardScaler() @@ -257,6 +271,7 @@ def _transform(self, data, y=None): class BoxCox(Primitive): transformer = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.transformer = PowerTransformer() @@ -271,9 +286,11 @@ def _transform(self, data, y=None): class Binning(Primitive): binner = None + strategy = None + encoding = None + supported_ops = ('add', 'upd') - def __init__(self, selected_type=None, operation='upd', strategy='quantile', encoding='ordinal'): - super().__init__(selected_type, operation) + def init_vars(self, strategy='quantile', encoding='ordinal'): self.strategy = strategy self.encoding = encoding @@ -291,6 +308,7 @@ def _transform(self, data, y=None): class CatEncoder(Primitive): cat_to_int_label = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): X = data.X @@ -314,6 +332,7 @@ def _transform(self, data, y=None): class TargetEncoder(Primitive): target_encoding_map = None + supported_ops = ('add', 'upd') @staticmethod def calc_smooth_mean(df, by, on, alpha=5): @@ -327,7 +346,7 @@ def calc_smooth_mean(df, by, on, alpha=5): # Compute the "smoothed" means smooth = (counts * means + alpha * mean) / (counts + alpha) - return smooth + return smooth, mean def _fit(self, data, y=None): X = data.X @@ -341,12 +360,13 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].map(self.target_encoding_map[col]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], self.target_encoding_map[col][1]) + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.__class__.__name__) return data class CatCatEncoder(PrimitiveHigherOrder): + supported_ops = ('add', 'upd') @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): if strategy == 'count': @@ -375,6 +395,8 @@ def _transform(self, data, y=None): class CatNumEncoder(PrimitiveHigherOrder): + supported_ops = ('add', 'upd') + def __init__(self, selected_type=None, selected_num=[], operation='add', strategy='mean'): super().__init__(selected_type, operation) self.selected_num = selected_num @@ -415,6 +437,8 @@ def _transform(self, data, y=None): class CatBinEncoder(PrimitiveHigherOrder): + supported_ops = ('add', 'upd') + @staticmethod def cat_bin_interaction(df, col1, col2, 
strategy='percent_true'): if strategy == 'percent_true': @@ -445,9 +469,10 @@ def _transform(self, data, y=None): class FilterConstant(Primitive): drop_columns = None + supported_ops = ('del',) def _fit(self, data, y=None): - X = data.X + X = data.X[self.selected] self.drop_columns = X.columns[(X.max(axis=0) - X.min(axis=0) == 0)].tolist() return self @@ -457,6 +482,7 @@ def _transform(self, data, y=None): class TimeDiff(Primitive): + supported_ops = ('add', ) def _fit(self, data, y=None): return self @@ -471,6 +497,7 @@ def _transform(self, data, y=None): class TimeOffset(Primitive): start_time = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.start_time = data.X[self.selected].min(axis=0) @@ -485,6 +512,7 @@ def _transform(self, data, y=None): class TabPCA(Primitive): pca = None + supported_ops = ('add', ) def _fit(self, data, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') @@ -500,6 +528,7 @@ def _transform(self, data, y=None): class CatCount(Primitive): count_dict = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.count_dict = {} @@ -517,6 +546,7 @@ def _transform(self, data, y=None): class LogTransform(Primitive): name_key = 'log_' + supported_ops = ('add', 'upd') def _fit(self, data, y=None): return self @@ -531,6 +561,7 @@ def _transform(self, data, y=None): class Imputation(Primitive): impute_dict = None + supported_ops = ('add', 'upd') def _fit(self, data, y=None): self.impute_dict = {} @@ -540,15 +571,18 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): + x_tr = pd.DataFrame() for col in self.selected: - data.X[col].fillna(self.impute_dict[col]) - data.update(self.operation, self.selected, None, new_type='NUM') + x_tr[col] = data.X[col].fillna(self.impute_dict[col]) + data.update(self.operation, self.selected, x_tr, new_type=None) return data class FeatureFilter(Primitive): - def __init__(self, selected_type=None, operation='del', threshold=0.001): - super().__init__(selected_type, operation) + threshold = None + supported_ops = ('del',) + + def init_vars(self, threshold=0.001): self.threshold = threshold self.drop_columns = [] @@ -567,8 +601,11 @@ def _transform(self, data, y=None): class FeatureImportance(Primitive): - def __init__(self, selected_type=None, operation='del', threshold=0.001, task_type='classification'): - super().__init__(selected_type, operation) + threshold = None + task_type = 'classification' + supported_ops = ('del',) + + def init_vars(self, threshold=0.001, task_type='classification'): self.threshold = threshold self.drop_columns = [] self.task_type = task_type diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 50bcfbb..b057257 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -285,6 +285,6 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) import time t1 = time.time() - ak.evaluate(3021) + ak.evaluate(2071) t2 = time.time() print(t2-t1) From 08f4a4ded758d337eca202ec1175413ab5c08072 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Sun, 15 Sep 2019 21:26:16 -0500 Subject: [PATCH 16/31] Added global config/fixed label encoder --- autokaggle/auto_ml.py | 32 +++++----- autokaggle/ensemblers.py | 1 - autokaggle/estimators.py | 110 +++++++++++++++++++++------------ autokaggle/preprocessor.py | 60 +++++++----------- examples/benchmarking.py | 2 +- examples/tabular_regression.py | 2 +- 6 files changed, 106 insertions(+), 101 deletions(-) diff --git a/autokaggle/auto_ml.py 
b/autokaggle/auto_ml.py index 97e0ea3..b726f5c 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -13,20 +13,16 @@ class AutoKaggle(BaseEstimator): - def __init__(self, estimator=Classifier(), preprocessor=TabularPreprocessor(), path=None, verbose=True): + def __init__(self, objective='classification', config=Config()): """ Initialization function for tabular supervised learner. """ - self.verbose = verbose self.is_trained = False - self.objective = None - self.preprocessor = preprocessor - self.model = estimator - self.path = path if path is not None else rand_temp_folder_generator() - ensure_dir(self.path) - if self.verbose: - print('Path:', path) - self.time_limit = None + self.config = config + self.config.objective = objective + self.preprocessor = TabularPreprocessor(config) + # TODO find elegant way of specifying classification or regression + self.model = Classifier(config) if objective == 'classification' else Regressor(config) def fit(self, x, y, time_limit=None, data_info=None): """ @@ -44,7 +40,7 @@ def fit(self, x, y, time_limit=None, data_info=None): you should warm-start your training from the pre-trained model. Past data will NOT be available for re-training. """ - self.time_limit = time_limit if time_limit else 24 * 60 * 60 + self.config.time_limit = time_limit if time_limit else 24 * 60 * 60 if x.shape[1] == 0: raise ValueError("No feature exist!") @@ -58,12 +54,12 @@ def fit(self, x, y, time_limit=None, data_info=None): # self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor - self.preprocessor.fit(x, y, self.time_limit, data_info) + self.preprocessor.fit(x, y, data_info) x = self.preprocessor.transform(x) self.model.fit(x, y) self.is_trained = True - if self.verbose: + if self.config.verbose: print("The whole available data is: ") print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) @@ -82,15 +78,15 @@ def predict(self, x_test, predict_proba=False): return y def evaluate(self, x_test, y_test): - if self.verbose: - print('objective:', self.model.objective) + if self.config.verbose: + print('objective:', self.config.objective) y_pred = self.predict(x_test) results = None - if self.model.objective == 'binary': + if self.config.objective == 'binary': results = roc_auc_score(y_test, y_pred) - elif self.model.objective == 'multiclass': + elif self.config.objective == 'multiclass': results = f1_score(y_test, y_pred, average='weighted') - elif self.model.objective == 'regression': + elif self.config.objective == 'regression': results = mean_squared_error(y_test, y_pred) return results diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 1e5fe88..6f6fd5d 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -1,5 +1,4 @@ from sklearn.base import BaseEstimator -from autokaggle.preprocessor import TabularPreprocessor from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from abc import abstractmethod import numpy as np diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 6d34553..45b90b7 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -105,28 +105,54 @@ } -class TabularEstimator(BaseEstimator): +class Config(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=25, - ensemble_strategy='ranked_ensembling', ensemble_method='max_voting'): - """ - Initialization function for tabular supervised learner. 
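# Standalone sketch: wiring the shared Config object introduced in this patch into
# AutoKaggle; the option values below are arbitrary, and x_train/y_train/x_test/datainfo
# are assumed to exist as in the examples/ scripts.
from autokaggle import AutoKaggle
from autokaggle.estimators import Config

cfg = Config(verbose=True, use_ensembling=True, num_estimators_ensemble=10,
             search_iter=200, cv_folds=3, subsample_ratio=0.1)
clf = AutoKaggle(objective='classification', config=cfg)
# clf.fit(x_train, y_train, time_limit=60 * 60, data_info=datainfo)
# y_prob = clf.predict(x_test, predict_proba=True)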
- """ + ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, + subsample_ratio=0.1): self.verbose = verbose - self.path = path + self.path = path if path is not None else rand_temp_folder_generator() + ensure_dir(self.path) + if self.verbose: + print('Path:', path) self.time_limit = time_limit self.objective = None - abs_cwd = os.path.split(os.path.abspath(__file__))[0] - self.best_estimator_ = None self.use_ensembling = use_ensembling self.hparams = None self.num_estimators_ensemble = num_estimators_ensemble self.ensemble_strategy = ensemble_strategy self.ensemble_method = ensemble_method - + self.search_iter = search_iter + self.cv_folds = cv_folds + self.subsample_ratio = subsample_ratio + self.resampling_strategy = 'auto' + self.random_state = 1001 + self.feature_add_high_cat = 0 + self.feature_add_cat_num = 10 + self.feature_add_cat_cat = 10 + + +class TabularEstimator(BaseEstimator): + def __init__(self, config=Config(), **kwargs): + """ + Initialization function for tabular supervised learner. + """ + self.config = config + # self.verbose = config.verbose + # self.path = config.path + # self.time_limit = config.time_limit + # self.objective = None + # abs_cwd = os.path.split(os.path.abspath(__file__))[0] + self.best_estimator_ = None + # self.use_ensembling = config.use_ensembling + self.hparams = None + # self.num_estimators_ensemble = config.num_estimators_ensemble + # self.ensemble_strategy = config.ensemble_strategy + # self.ensemble_method = config.ensemble_method + def fit(self, x, y): - if self.objective == 'classification': + if self.config.objective == 'classification': n_classes = len(set(y)) - self.objective = 'binary' if n_classes == 2 else 'multiclass' + self.config.objective = 'binary' if n_classes == 2 else 'multiclass' # x, y = self.resample(x, y) self.best_estimator_, _ = self.search(x, y) self.best_estimator_.fit(x, y) @@ -142,10 +168,9 @@ def predict_proba(self, x, y=None): except: y_pred = self.best_estimator_.predict(x, ) return y_pred - - @staticmethod - def resample(X, y): - return SMOTE(sampling_strategy='auto').fit_resample(X, y) + + def resample(self, X, y): + return SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(X, y) @staticmethod def subsample(x, y, sample_percent): @@ -160,9 +185,9 @@ def subsample(x, y, sample_percent): grid_train_x, grid_train_y = x[idx, :], y[idx] return grid_train_x, grid_train_y - def search(self, x, y, search_iter=100, folds=3, sample_percent=0.1): - grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=sample_percent) - score_metric, skf = self.get_skf(folds) + def search(self, x, y): + grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) + score_metric, skf = self.get_skf(self.config.cv_folds) def objective_func(args): clf = args['model'](**args['param']) @@ -170,29 +195,33 @@ def objective_func(args): eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() except ValueError: eval_score = 0 - if self.verbose: + if self.config.verbose: print("CV Score:", eval_score) print("\n=================") return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=search_iter) - if self.use_ensembling: + best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, + max_evals=self.config.search_iter) + if self.config.use_ensembling: 
best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) estimator_list = [] - for i in range(self.num_estimators_ensemble): + for i in range(self.config.num_estimators_ensemble): model_params = best_trials[i]['space'] est = model_params['model'](**model_params['param']) estimator_list.append(est) - if self.ensemble_strategy == 'ranked_ensembling': - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.ensemble_method) - elif self.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, objective=self.objective) + if self.config.ensemble_strategy == 'ranked_ensembling': + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + elif self.config.ensemble_strategy == 'stacking': + best_estimator_ = StackingEnsembler(estimator_list, objective=self.config.objective) else: - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.ensemble_method) + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) else: opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) + if self.config.verbose: + print("The best hyperparameter setting is:") + print(opt) return best_estimator_, trials @abstractmethod @@ -208,9 +237,9 @@ class Classifier(TabularEstimator): """Classifier class. It is used for tabular data classification. """ - def __init__(self, path=None, verbose=True, time_limit=None): - super().__init__(path, verbose, time_limit) - self.objective = 'classification' + def __init__(self, config=Config(), **kwargs): + super().__init__(config, **kwargs) + self.config.objective = 'classification' # TODO: add choice to the set of estimators self.hparams = hp.choice('classifier', [ {'model': KNeighborsClassifier, @@ -225,18 +254,18 @@ def __init__(self, path=None, verbose=True, time_limit=None): {'model': LGBMClassifier, 'param': lgbm_classifier_params }, - # {'model': AdaBoostClassifier, - # 'param': adaboost_classifier_params - # } + {'model': AdaBoostClassifier, + 'param': adaboost_classifier_params + } ]) def get_skf(self, folds): - if self.objective == 'binary': + if self.config.objective == 'binary': score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) else: score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) return score_metric, skf @@ -244,9 +273,9 @@ class Regressor(TabularEstimator): """Regressor class. It is used for tabular data regression. 
""" - def __init__(self, path=None, verbose=True, time_limit=None): - super().__init__(path, verbose, time_limit) - self.objective = 'regression' + def __init__(self, config=Config(), **kwargs): + super().__init__(config, **kwargs) + self.config.objective = 'regression' # TODO: add choice to the set of estimators self.hparams = hp.choice('regressor', [ {'model': ExtraTreesRegressor, @@ -267,5 +296,4 @@ def __init__(self, path=None, verbose=True, time_limit=None): ]) def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index c42f7f6..baee70e 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -12,25 +12,20 @@ from abc import abstractmethod import collections from lightgbm import LGBMClassifier, LGBMRegressor +from autokaggle.estimators import Config LEVEL_HIGH = 32 class TabularPreprocessor: - def __init__(self, verbose=True): + def __init__(self, config): """ Initialization function for tabular preprocessor. """ - self.verbose = verbose - - self.feature_add_high_cat = 0 - self.feature_add_cat_num = 10 - self.feature_add_cat_cat = 10 - - self.budget = None self.data_info = None self.pipeline = None + self.config = config - def fit(self, raw_x, y, time_limit, data_info): + def fit(self, raw_x, y, data_info): """ This function should train the model parameters. @@ -43,19 +38,18 @@ def fit(self, raw_x, y, time_limit, data_info): 'TIME' for temporal feature, 'NUM' for other numerical feature, and 'CAT' for categorical feature. """ - self.budget = time_limit # Extract or read data info self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - data = TabularData(raw_x, self.data_info, self.verbose) + data = TabularData(raw_x, self.data_info, self.config.verbose) self.pipeline = Pipeline([ ('imputer', Imputation(selected_type='ALL', operation='upd')), # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), # ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), # ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), - ('count_encoder', CatCount(selected_type='CAT', operation='upd')), - # ('label_encoder', CatEncoder(selected_type='CAT', operation='add')), + # ('count_encoder', CatCount(selected_type='CAT', operation='upd')), + ('label_encoder', CatEncoder(selected_type='CAT', operation='upd')), # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), # ('log_square', LogTransform(selected_type='NUM', operation='upd')), ('scaler', TabScaler(selected_type='NUM', operation='upd')), @@ -71,29 +65,20 @@ def fit(self, raw_x, y, time_limit, data_info): return self - def transform(self, raw_x, time_limit=None): + def transform(self, raw_x): """ This function should train the model parameters. Args: raw_x: a numpy.ndarray instance containing the training/testing data. - time_limit: remaining time budget. Both inputs X and y are numpy arrays. If fit is called multiple times on incremental data (train, test1, test2, etc.) you should warm-start your training from the pre-trained model. Past data will NOT be available for re-training. 
""" # Get Meta-Feature - if time_limit is None: - if self.budget is None: - time_limit = 24 * 60 * 60 - self.budget = time_limit - else: - self.budget = time_limit - - data = TabularData(raw_x, self.data_info, self.verbose) + data = TabularData(raw_x, self.data_info, self.config.verbose) a = self.pipeline.transform(data).X - print(a.head()) return a.values @staticmethod @@ -308,25 +293,22 @@ def _transform(self, data, y=None): class CatEncoder(Primitive): cat_to_int_label = None + unknown_key_dict = None supported_ops = ('add', 'upd') def _fit(self, data, y=None): - X = data.X self.cat_to_int_label = {} - for col_index in self.selected: - self.cat_to_int_label[col_index] = self.cat_to_int_label.get(col_index, {}) - for row_index in range(len(X)): - key = str(X[row_index, col_index]) - if key not in self.cat_to_int_label[col_index]: - self.cat_to_int_label[col_index][key] = len(self.cat_to_int_label[col_index]) + self.unknown_key_dict = {} + for col in self.selected: + self.cat_to_int_label[col] = {key: idx for idx, key in enumerate(set(data.X[col]))} + self.unknown_key_dict[col] = len(self.cat_to_int_label[col]) return self def _transform(self, data, y=None): - X = data.X - for col_index in self.selected: - for row_index in range(len(X)): - key = str(X[row_index, col_index]) - X[row_index, col_index] = self.cat_to_int_label[col_index].get(key, np.nan) + x_tr = pd.DataFrame() + for col in self.selected: + x_tr[col] = data.X[col].apply(lambda key: self.cat_to_int_label[col].get(key, self.unknown_key_dict[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM') return data @@ -528,6 +510,7 @@ def _transform(self, data, y=None): class CatCount(Primitive): count_dict = None + unknown_key = 0 supported_ops = ('add', 'upd') def _fit(self, data, y=None): @@ -539,7 +522,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col][key]) + x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col].get(key, self.unknown_key)) data.update(self.operation, self.selected, x_tr, new_type='NUM') return data @@ -663,5 +646,4 @@ def _transform(self, data, y=None): x_new = prep.transform(x_train) print("-----") - print(x_new[:4, 2]) - + print(x_new[:4, 2]) \ No newline at end of file diff --git a/examples/benchmarking.py b/examples/benchmarking.py index b057257..60ceffa 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -285,6 +285,6 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) import time t1 = time.time() - ak.evaluate(2071) + ak.evaluate(31) t2 = time.time() print(t2-t1) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index c338dd2..3b49846 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle(Regressor()) + clf = AutoKaggle('regression') datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From cdd2ba7e65b301f649b20ff35046e56f9f84bb65 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 17 Sep 2019 11:01:03 -0500 Subject: [PATCH 17/31] Split to classifier and regressor --- autokaggle/__init__.py | 2 +- autokaggle/auto_ml.py | 26 +++++-- autokaggle/ensemblers.py | 12 ++-- autokaggle/estimators.py | 51 ++++++++----- autokaggle/preprocessor.py | 71 
++++++++++++------- examples/benchmarking.py | 18 +++-- examples/tabular_classification_binary.py | 2 +- examples/tabular_classification_multiclass.py | 2 +- examples/tabular_regression.py | 2 +- 9 files changed, 125 insertions(+), 61 deletions(-) diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index 0245d5b..1a975c9 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,3 +1,3 @@ -from autokaggle.auto_ml import AutoKaggle +from autokaggle.auto_ml import AutoKaggleClassifier, AutoKaggleRegressor from autokaggle.estimators import * from autokaggle.ensemblers import * diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index b726f5c..c2fc7c5 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -13,16 +13,18 @@ class AutoKaggle(BaseEstimator): - def __init__(self, objective='classification', config=Config()): + objective = None + model = None + + def __init__(self, config=Config(), **kwargs): """ Initialization function for tabular supervised learner. """ self.is_trained = False self.config = config - self.config.objective = objective + self.config.update(kwargs) + self.config.objective = self.objective self.preprocessor = TabularPreprocessor(config) - # TODO find elegant way of specifying classification or regression - self.model = Classifier(config) if objective == 'classification' else Regressor(config) def fit(self, x, y, time_limit=None, data_info=None): """ @@ -93,3 +95,19 @@ def evaluate(self, x_test, y_test): def final_fit(self, x_train, y_train): x_train = self.preprocessor.transform(x_train) self.model.fit(x_train, y_train) + + +class AutoKaggleClassifier(AutoKaggle): + objective = 'classification' + + def __init__(self, config=Config(), **kwargs): + super().__init__(config, **kwargs) + self.model = Classifier(config) + + +class AutoKaggleRegressor(AutoKaggle): + objective = 'regression' + + def __init__(self, config=Config(), **kwargs): + super().__init__(config, **kwargs) + self.model = Regressor(config) diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 6f6fd5d..507fa6e 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -49,14 +49,14 @@ def __init__(self, estimator_list, objective): self.objective = objective if self.objective == 'regression': self.stacking_estimator = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) + verbose=-1, + n_jobs=1, + objective=self.objective) elif self.objective == 'multiclass' or self.objective == 'binary': self.stacking_estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) + verbose=-1, + n_jobs=1, + objective=self.objective) def fit(self, X, y): for est in self.estimator_list: diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 45b90b7..7a1d4a5 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -106,9 +106,9 @@ class Config(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=25, + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, - subsample_ratio=0.1): + subsample_ratio=0.1, random_ensemble=False): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -119,16 +119,20 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=Fals 
self.use_ensembling = use_ensembling self.hparams = None self.num_estimators_ensemble = num_estimators_ensemble + # self.ensemble_strategy = ensemble_strategy self.ensemble_strategy = ensemble_strategy self.ensemble_method = ensemble_method + self.random_ensemble = random_ensemble self.search_iter = search_iter self.cv_folds = cv_folds self.subsample_ratio = subsample_ratio self.resampling_strategy = 'auto' self.random_state = 1001 - self.feature_add_high_cat = 0 - self.feature_add_cat_num = 10 - self.feature_add_cat_cat = 10 + + def update(self, options): + for k, v in options.items(): + if hasattr(self, k): + setattr(self, k, v) class TabularEstimator(BaseEstimator): @@ -203,24 +207,14 @@ def objective_func(args): trials = Trials() best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, max_evals=self.config.search_iter) + if self.config.use_ensembling: - best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) - estimator_list = [] - for i in range(self.config.num_estimators_ensemble): - model_params = best_trials[i]['space'] - est = model_params['model'](**model_params['param']) - estimator_list.append(est) - if self.config.ensemble_strategy == 'ranked_ensembling': - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) - elif self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, objective=self.config.objective) - else: - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + best_estimator_ = self.setup_ensemble(trials) else: opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) if self.config.verbose: - print("The best hyperparameter setting is:") + print("The best hyperparameter setting found:") print(opt) return best_estimator_, trials @@ -231,6 +225,27 @@ def save_model(self): @abstractmethod def get_skf(self, folds): pass + + def setup_ensemble(self, trials): + best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) + # Filter the unsuccessful hparam spaces i.e. 
'loss' == 1 + best_trials = [t for t in best_trials if t['loss'] < 1] + self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) + if self.config.random_ensemble: + np.random.shuffle(best_trials) + estimator_list = [] + for i in range(self.config.num_estimators_ensemble): + model_params = best_trials[i]['space'] + est = model_params['model'](**model_params['param']) + estimator_list.append(est) + + if self.config.ensemble_strategy == 'ranked_ensembling': + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + elif self.config.ensemble_strategy == 'stacking': + best_estimator_ = StackingEnsembler(estimator_list, objective=self.config.objective) + else: + best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + return best_estimator_ class Classifier(TabularEstimator): diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index baee70e..6bc0013 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -4,7 +4,7 @@ import itertools from scipy.stats import pearsonr from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer +from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer, OneHotEncoder from sklearn.base import TransformerMixin from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer @@ -47,12 +47,13 @@ def fit(self, raw_x, y, data_info): ('imputer', Imputation(selected_type='ALL', operation='upd')), # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), # ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), - # ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), + ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), # ('count_encoder', CatCount(selected_type='CAT', operation='upd')), - ('label_encoder', CatEncoder(selected_type='CAT', operation='upd')), - # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), - # ('log_square', LogTransform(selected_type='NUM', operation='upd')), + # ('one_hot_encoder', OneHot(selected_type='CAT', operation='upd')), + # ('label_encoder', LabelEncode(selected_type='CAT', operation='upd')), ('scaler', TabScaler(selected_type='NUM', operation='upd')), + # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), + # ('log_transform', LogTransform(selected_type='NUM', operation='upd')), # ('binning', Binning(selected_type='NUM', operation='upd')), # ('pca', TabPCA(selected_type='NUM', operation='add')), # ('time_diff', TimeDiff(selected_type='TIME', operation='add')), @@ -142,6 +143,7 @@ def delete_type(self, columns): def rename_cols(self, key): def rename_fn(col_name): + col_name = str(col_name) col_name += '_' + key while col_name in self.X.columns: col_name += '_' + key @@ -201,11 +203,13 @@ class Primitive(BaseEstimator, TransformerMixin): drop_columns = None options = None supported_ops = ('add', 'upd', 'del') + name_key = '' def __init__(self, selected_type=None, operation='upd', **kwargs): self.selected_type = selected_type self.operation = operation self.init_vars(**kwargs) + self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): self.options = kwargs @@ -250,7 +254,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.scaler.transform(data.X[self.selected]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + 
data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -265,7 +269,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.transformer.transform(data.X[self.selected]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -287,11 +291,30 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.binner.transform(data.X[self.selected]) # TODO: decide if cat or num new type - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + return data + + +class OneHot(Primitive): + ohe = None + supported_ops = ('add', 'upd') + + def _fit(self, data, y=None): + self.ohe = OneHotEncoder(sparse=False, handle_unknown='ignore') + self.ohe.fit(data.X[self.selected], y) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame(self.ohe.transform(data.X[self.selected])) + if self.operation == 'add': + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + elif self.operation == 'upd': + data.update('add', self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update('del', self.selected, None, None, key=self.name_key) return data -class CatEncoder(Primitive): +class LabelEncode(Primitive): cat_to_int_label = None unknown_key_dict = None supported_ops = ('add', 'upd') @@ -308,7 +331,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].apply(lambda key: self.cat_to_int_label[col].get(key, self.unknown_key_dict[col])) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -343,7 +366,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], self.target_encoding_map[col][1]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.__class__.__name__) + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -372,7 +395,7 @@ def _transform(self, data, y=None): if col1 + '_cross_' + col2 in self.cat_cat_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) # TODO: decide new_type - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -414,7 +437,7 @@ def _transform(self, data, y=None): for col2 in self.selected_num: if col1 + '_cross_' + col2 in self.cat_num_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -445,7 +468,7 @@ def _transform(self, data, y=None): for col2 in self.selected_bin: if col1 + '_cross_' + col2 in self.cat_bin_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -459,7 +482,7 @@ def _fit(self, data, y=None): return self def _transform(self, 
data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None) + data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) return data @@ -473,7 +496,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for a, b in itertools.combinations(self.selected, 2): x_tr[a + '-' + b] = data.X[a] - data.X[b] - data.update(self.operation, self.selected, x_tr, new_type='TIME') + data.update(self.operation, self.selected, x_tr, new_type='TIME', key=self.name_key) return data @@ -488,7 +511,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() x_tr[self.selected] = data.X[self.selected] - self.start_time - data.update(self.operation, self.selected, x_tr, new_type='TIME') + data.update(self.operation, self.selected, x_tr, new_type='TIME', key=self.name_key) return data @@ -504,7 +527,7 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_pca = self.pca.transform(data.X[self.selected]) x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in range(x_pca.shape[1])]) - data.update(self.operation, self.selected, x_pca, new_type='NUM') + data.update(self.operation, self.selected, x_pca, new_type='NUM', key=self.name_key) return data @@ -523,7 +546,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col].get(key, self.unknown_key)) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -537,8 +560,8 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[self.name_key + col] = np.square(np.log(1 + data.X[col])) - data.update(self.operation, self.selected, x_tr, new_type='NUM') + x_tr[self.name_key + col] = np.sign(data.X[col]) * np.log(1 + np.abs(data.X[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data @@ -557,7 +580,7 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].fillna(self.impute_dict[col]) - data.update(self.operation, self.selected, x_tr, new_type=None) + data.update(self.operation, self.selected, x_tr, new_type=None, key=self.name_key) return data @@ -579,7 +602,7 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None) + data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) return data @@ -620,7 +643,7 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None) + data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) return data diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 60ceffa..c1f8a09 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -120,9 +120,9 @@ def evaluate(self, task_id, time_limit=10*60): # Train if task.task_type == 'Supervised Classification': - automl = AutoKaggle() if not self.estimator_type else AutoKaggle(self.estimator_type) + automl = AutoKaggleClassifier() elif task.task_type == 'Supervised Regression': - automl = AutoKaggle(Regressor()) if not self.estimator_type else AutoKaggle(self.estimator_type) + automl = AutoKaggleRegressor() else: print("UNSUPPORTED TASK_TYPE") assert(0) @@ 
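The LogTransform change above replaces np.square(np.log(1 + x)) with a signed log, which stays finite for negative inputs; a small worked example of the new transform:

import numpy as np

def signed_log(col):
    # Keep the sign and log-compress the magnitude, so the transform is
    # defined for negative values as well.
    return np.sign(col) * np.log(1 + np.abs(col))

x = np.array([-100.0, -1.0, 0.0, 1.0, 100.0])
print(np.round(signed_log(x), 3))  # [-4.615 -0.693  0.     0.693  4.615]
# The previous np.square(np.log(1 + x)) is NaN or infinite for x <= -1.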
-279,12 +279,20 @@ def get_box_plot(data, task_id, metric): if __name__ == "__main__": regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] + # classification_task_list = [3021] classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, 9914, 3020, 3524, 3573, 3962] - ak = BenchmarkingAutoKaggle() - # ak.run_automation(classification_task_list) + ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time + t1 = time.time() - ak.evaluate(31) + for _ in range(3): + ak.run_automation(classification_task_list) + ak.export_results() t2 = time.time() print(t2-t1) + # import time + # t1 = time.time() + # ak.evaluate(31) + # t2 = time.time() + # print(t2-t1) diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index 0ebd91f..fa8638d 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -19,7 +19,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle() + clf = AutoKaggleClassifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 0da02fe..109d361 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle() + clf = AutoKaggleClassifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 3b49846..885160c 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggle('regression') + clf = AutoKaggleRegressor('regression') datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 516515b85cd23c3f642dad1ff9fa48276ff74e20 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 17 Sep 2019 12:56:32 -0500 Subject: [PATCH 18/31] Refactor Config --- autokaggle/auto_ml.py | 9 ++- autokaggle/config.py | 169 +++++++++++++++++++++++++++++++++++++++ autokaggle/estimators.py | 165 +------------------------------------- examples/benchmarking.py | 6 +- 4 files changed, 181 insertions(+), 168 deletions(-) create mode 100644 autokaggle/config.py diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index c2fc7c5..60bb7a1 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -12,9 +12,11 @@ from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +# TODO: Further clean the design of this file class AutoKaggle(BaseEstimator): objective = None model = None + preprocessor = None def __init__(self, config=Config(), **kwargs): """ @@ -24,6 +26,9 @@ def __init__(self, config=Config(), **kwargs): self.config = config self.config.update(kwargs) self.config.objective = self.objective + if not self.config.path: + self.config.path = rand_temp_folder_generator() + # abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.preprocessor = TabularPreprocessor(config) def fit(self, x, y, 
time_limit=None, data_info=None): @@ -50,10 +55,6 @@ def fit(self, x, y, time_limit=None, data_info=None): while x.shape[0] < 60: x = np.concatenate([x, x], axis=0) y = np.concatenate([y, y], axis=0) - - # # Init model and preprocessor - # self.model = self.estimator_class(verbose=self.verbose, path=self.path, time_limit=self.time_limit) - # self.preprocessor = TabularPreprocessor() # Fit Model and preprocessor self.preprocessor.fit(x, y, data_info) diff --git a/autokaggle/config.py b/autokaggle/config.py new file mode 100644 index 0000000..edbc65c --- /dev/null +++ b/autokaggle/config.py @@ -0,0 +1,169 @@ +from sklearn.base import BaseEstimator +from autokaggle.utils import rand_temp_folder_generator, ensure_dir +from hyperopt import hp +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor,\ + ExtraTreesRegressor +from sklearn.linear_model import Ridge +from lightgbm import LGBMClassifier, LGBMRegressor + + +class Config(BaseEstimator): + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, + ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False): + self.verbose = verbose + self.path = path if path is not None else rand_temp_folder_generator() + ensure_dir(self.path) + if self.verbose: + print('Path:', self.path) + self.time_limit = time_limit + self.objective = None + self.use_ensembling = use_ensembling + self.hparams = None + self.num_estimators_ensemble = num_estimators_ensemble + self.ensemble_strategy = ensemble_strategy + self.ensemble_method = ensemble_method + self.random_ensemble = random_ensemble + self.search_iter = search_iter + self.cv_folds = cv_folds + self.subsample_ratio = subsample_ratio + self.resampling_strategy = 'auto' + self.random_state = 1001 + self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest', 'adaboost'] + self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost'] + + def update(self, options): + for k, v in options.items(): + if hasattr(self, k): + setattr(self, k, v) + + +knn_classifier_params = { + 'n_neighbors': hp.choice('n_neighbors', range(2, 20)), + 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), + 'leaf_size': hp.choice('leaf_size', range(5, 50)), + 'metric': hp.choice('metric', ["euclidean", "manhattan", "chebyshev", "minkowski"]), + 'p': hp.choice('p', range(1, 4)), +} + +svc_params = { + 'C': hp.lognormal('C', 0, 1), + 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), + 'degree': hp.choice('degree', range(1, 6)), + 'gamma': hp.uniform('gamma', 0.001, 10000), + 'max_iter': 50000, +} + +random_forest_classifier_params = { + 'criterion': hp.choice('criterion', ['entropy', 'gini']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) +} + +lgbm_classifier_params = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), +} 
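These dictionaries are ordinary hyperopt search spaces; estimators.py wraps them in hp.choice and hands them to fmin. A minimal, self-contained sketch of how such a space is searched and decoded (the toy objective below only stands in for the real cross-validation loss):

import hyperopt
from hyperopt import fmin, hp, space_eval, Trials, STATUS_OK

space = {
    'n_estimators': hp.choice('n_estimators', range(50, 200)),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
}

def objective(params):
    # Stand-in for the cross_val_score call in TabularEstimator.search.
    loss = abs(params['learning_rate'] - 0.1) + params['n_estimators'] / 1e4
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()
best = fmin(objective, space, algo=hyperopt.rand.suggest, trials=trials, max_evals=20)
print(space_eval(space, best))  # e.g. {'learning_rate': 0.08..., 'n_estimators': 72}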
+ +adaboost_classifier_params = { + 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), +} + +extra_trees_regressor_params = { + 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 200)), + 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), + 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), + 'min_impurity_decrease': 0.0 +} + +ridge_params = { + 'fit_intercept': True, + 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), + 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) +} + +random_forest_regressor_params = { + 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features', 0, 1.0), + 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), + 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) +} + +lgbm_regressor_params = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), + 'max_depth': hp.choice('max_depth', range(5, 10)), + 'n_estimators': hp.choice('n_estimators', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate', 0, 1), +} + +adaboost_regressor_params = { + 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), + 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), +} + + +regression_hspace = { + 'extratree': { + 'model': ExtraTreesRegressor, + 'param': extra_trees_regressor_params + }, + 'ridge': { + 'model': Ridge, + 'param': ridge_params + }, + 'random_forest': { + 'model': RandomForestRegressor, + 'param': random_forest_regressor_params + }, + 'lgbm': { + 'model': LGBMRegressor, + 'param': lgbm_regressor_params + }, + 'adaboost': { + 'model': AdaBoostRegressor, + 'param': adaboost_regressor_params + } +} + + +classification_hspace = { + 'knn': { + 'model': KNeighborsClassifier, + 'param': knn_classifier_params + }, + 'svm': { + 'model': SVC, + 'param': svc_params + }, + 'random_forest': { + 'model': RandomForestClassifier, + 'param': random_forest_classifier_params + }, + 'lgbm': { + 'model': LGBMClassifier, + 'param': lgbm_classifier_params + }, + 'adaboost': { + 'model': AdaBoostClassifier, + 'param': adaboost_classifier_params + } +} \ No newline at end of file diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 7a1d4a5..8cc3fef 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -5,153 +5,28 @@ import random import json -from lightgbm import LGBMClassifier, LGBMRegressor from sklearn.model_selection import RandomizedSearchCV, cross_val_score from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.svm import SVC -from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor,\ - ExtraTreesRegressor -from sklearn.linear_model import Ridge + from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer from joblib import dump, load from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from autokaggle.ensemblers import RankedEnsembler, 
StackingEnsembler +from autokaggle.config import Config, classification_hspace, regression_hspace import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK from imblearn.over_sampling import SMOTE, SMOTENC -# TODO: Way to change the default hparams -knn_classifier_params = { - 'n_neighbors': hp.choice('n_neighbors', range(2, 20)), - 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), - 'leaf_size': hp.choice('leaf_size', range(5, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", "chebyshev", "minkowski"]), - 'p': hp.choice('p', range(1, 4)), -} - -svc_params = { - 'C': hp.lognormal('C', 0, 1), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 6)), - 'gamma': hp.uniform('gamma', 0.001, 10000), - 'max_iter': 50000, -} - -random_forest_classifier_params = { - 'criterion': hp.choice('criterion', ['entropy', 'gini']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) -} - -lgbm_classifier_params = { - 'boosting_type': 'gbdt', - 'min_split_gain': 0.1, - 'subsample': 0.8, - 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), -} - -adaboost_classifier_params = { - 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), -} - -extra_trees_regressor_params = { - 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 200)), - 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), - 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), - 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), - 'min_impurity_decrease': 0.0 -} - -ridge_params = { - 'fit_intercept': True, - 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), - 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) -} - -random_forest_regressor_params = { - 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) -} - -lgbm_regressor_params = { - 'boosting_type': 'gbdt', - 'min_split_gain': 0.1, - 'subsample': 0.8, - 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), -} - -adaboost_regressor_params = { - 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), -} - - -class Config(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, - 
subsample_ratio=0.1, random_ensemble=False): - self.verbose = verbose - self.path = path if path is not None else rand_temp_folder_generator() - ensure_dir(self.path) - if self.verbose: - print('Path:', path) - self.time_limit = time_limit - self.objective = None - self.use_ensembling = use_ensembling - self.hparams = None - self.num_estimators_ensemble = num_estimators_ensemble - # self.ensemble_strategy = ensemble_strategy - self.ensemble_strategy = ensemble_strategy - self.ensemble_method = ensemble_method - self.random_ensemble = random_ensemble - self.search_iter = search_iter - self.cv_folds = cv_folds - self.subsample_ratio = subsample_ratio - self.resampling_strategy = 'auto' - self.random_state = 1001 - - def update(self, options): - for k, v in options.items(): - if hasattr(self, k): - setattr(self, k, v) - - class TabularEstimator(BaseEstimator): def __init__(self, config=Config(), **kwargs): """ Initialization function for tabular supervised learner. """ self.config = config - # self.verbose = config.verbose - # self.path = config.path - # self.time_limit = config.time_limit - # self.objective = None - # abs_cwd = os.path.split(os.path.abspath(__file__))[0] self.best_estimator_ = None - # self.use_ensembling = config.use_ensembling self.hparams = None - # self.num_estimators_ensemble = config.num_estimators_ensemble - # self.ensemble_strategy = config.ensemble_strategy - # self.ensemble_method = config.ensemble_method def fit(self, x, y): if self.config.objective == 'classification': @@ -256,23 +131,7 @@ def __init__(self, config=Config(), **kwargs): super().__init__(config, **kwargs) self.config.objective = 'classification' # TODO: add choice to the set of estimators - self.hparams = hp.choice('classifier', [ - {'model': KNeighborsClassifier, - 'param': knn_classifier_params - }, - {'model': SVC, - 'param': svc_params - }, - {'model': RandomForestClassifier, - 'param': random_forest_classifier_params - }, - {'model': LGBMClassifier, - 'param': lgbm_classifier_params - }, - {'model': AdaBoostClassifier, - 'param': adaboost_classifier_params - } - ]) + self.hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) def get_skf(self, folds): if self.config.objective == 'binary': @@ -292,23 +151,7 @@ def __init__(self, config=Config(), **kwargs): super().__init__(config, **kwargs) self.config.objective = 'regression' # TODO: add choice to the set of estimators - self.hparams = hp.choice('regressor', [ - {'model': ExtraTreesRegressor, - 'param': extra_trees_regressor_params - }, - {'model': Ridge, - 'param': ridge_params - }, - {'model': RandomForestRegressor, - 'param': random_forest_regressor_params - }, - {'model': LGBMRegressor, - 'param': lgbm_regressor_params - }, - {'model': AdaBoostRegressor, - 'param': adaboost_regressor_params - } - ]) + self.hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) def get_skf(self, folds): return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) diff --git a/examples/benchmarking.py b/examples/benchmarking.py index c1f8a09..dec5f93 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -279,9 +279,9 @@ def get_box_plot(data, task_id, metric): if __name__ == "__main__": regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] - # classification_task_list = [3021] - classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 
24, 146230, 31, 10101, - 9914, 3020, 3524, 3573, 3962] + classification_task_list = [3021] + # classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, + # 9914, 3020, 3524, 3573, 3962] ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time From c400a2f58f3e44b1324e51eed504b586d743d5ea Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 17 Sep 2019 15:55:16 -0500 Subject: [PATCH 19/31] Fix config init related bug --- autokaggle/auto_ml.py | 14 +++++++------- autokaggle/config.py | 2 +- autokaggle/estimators.py | 14 ++++++++------ examples/benchmarking.py | 1 + 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 60bb7a1..9fc82c2 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -18,18 +18,18 @@ class AutoKaggle(BaseEstimator): model = None preprocessor = None - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): """ Initialization function for tabular supervised learner. """ self.is_trained = False - self.config = config + self.config = config if config else Config() self.config.update(kwargs) self.config.objective = self.objective if not self.config.path: self.config.path = rand_temp_folder_generator() # abs_cwd = os.path.split(os.path.abspath(__file__))[0] - self.preprocessor = TabularPreprocessor(config) + self.preprocessor = TabularPreprocessor(self.config) def fit(self, x, y, time_limit=None, data_info=None): """ @@ -101,14 +101,14 @@ def final_fit(self, x_train, y_train): class AutoKaggleClassifier(AutoKaggle): objective = 'classification' - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) - self.model = Classifier(config) + self.model = Classifier(self.config) class AutoKaggleRegressor(AutoKaggle): objective = 'regression' - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) - self.model = Regressor(config) + self.model = Regressor(self.config) diff --git a/autokaggle/config.py b/autokaggle/config.py index edbc65c..e4d87aa 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -11,7 +11,7 @@ class Config(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=500, cv_folds=3, + ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=5, cv_folds=3, subsample_ratio=0.1, random_ensemble=False): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 8cc3fef..214940b 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -20,11 +20,11 @@ class TabularEstimator(BaseEstimator): - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): """ Initialization function for tabular supervised learner. 
""" - self.config = config + self.config = config if config else Config() self.best_estimator_ = None self.hparams = None @@ -68,7 +68,8 @@ def search(self, x, y): grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) score_metric, skf = self.get_skf(self.config.cv_folds) - def objective_func(args): + def objective_func(params): + args = params['estimator'] clf = args['model'](**args['param']) try: eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() @@ -80,7 +81,8 @@ def objective_func(args): return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, + search_space = {'source': hp.choice('data_source', ['a', 'b']), 'estimator': self.hparams} + best = fmin(objective_func, search_space, algo=hyperopt.rand.suggest, trials=trials, max_evals=self.config.search_iter) if self.config.use_ensembling: @@ -127,7 +129,7 @@ class Classifier(TabularEstimator): """Classifier class. It is used for tabular data classification. """ - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) self.config.objective = 'classification' # TODO: add choice to the set of estimators @@ -147,7 +149,7 @@ class Regressor(TabularEstimator): """Regressor class. It is used for tabular data regression. """ - def __init__(self, config=Config(), **kwargs): + def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) self.config.objective = 'regression' # TODO: add choice to the set of estimators diff --git a/examples/benchmarking.py b/examples/benchmarking.py index dec5f93..750cbbb 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -63,6 +63,7 @@ def run_automation(self, task_list, time_limit=10*60): for task in task_list: try: self.evaluate(task, time_limit=time_limit) + self.export_results() except: print("task: {} didnt work".format(task)) From b35d9be2046ae807cfef52b081d74c3d7bc2ee21 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 18 Sep 2019 11:32:01 -0500 Subject: [PATCH 20/31] Diverse Ensembles --- autokaggle/config.py | 52 ++++++++++++++++++++++++++++++-------- autokaggle/estimators.py | 40 ++++++++++++++++++++++++----- autokaggle/preprocessor.py | 3 ++- examples/benchmarking.py | 9 +++---- 4 files changed, 80 insertions(+), 24 deletions(-) diff --git a/autokaggle/config.py b/autokaggle/config.py index e4d87aa..9b48062 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -7,12 +7,14 @@ ExtraTreesRegressor from sklearn.linear_model import Ridge from lightgbm import LGBMClassifier, LGBMRegressor +from catboost import CatBoostClassifier, Pool, CatBoostRegressor +import numpy as np class Config(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='ranked_ensembling', ensemble_method='max_voting', search_iter=5, cv_folds=3, - subsample_ratio=0.1, random_ensemble=False): + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -32,7 +34,9 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.resampling_strategy = 'auto' self.random_state = 1001 
self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest', 'adaboost'] - self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost'] + # self.classification_models = ['knn', 'lgbm', 'random_forest',] + self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost', 'catboost'] + self.diverse_ensemble = diverse_ensemble def update(self, options): for k, v in options.items(): @@ -49,9 +53,9 @@ def update(self, options): } svc_params = { - 'C': hp.lognormal('C', 0, 1), + 'C': hp.loguniform('C', np.log(1e-5), np.log(1e5)), 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 6)), + 'degree': hp.choice('degree', range(1, 7)), 'gamma': hp.uniform('gamma', 0.001, 10000), 'max_iter': 50000, } @@ -72,13 +76,23 @@ def update(self, options): 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), 'max_depth': hp.choice('max_depth', range(5, 10)), 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), + 'learning_rate': hp.loguniform('learning_rate', low=np.log(1e-5), high=np.log(1)), } adaboost_classifier_params = { 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-5), high=np.log(1)), +} + +catboost_classifier_params = { + 'iterations': hp.choice('catboost_iterations', [5, 10]), + 'depth': hp.choice('depth_catboost', range(4, 11)), + 'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), high=np.log(1)), + 'loss_function': hp.choice('loss_function_catboost', ['Logloss', 'CrossEntropy']), + 'verbose': True, + 'leaf_estimation_iterations': 10, + 'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3)) } extra_trees_regressor_params = { @@ -92,7 +106,7 @@ def update(self, options): ridge_params = { 'fit_intercept': True, 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), - 'alpha': hp.loguniform('alpha_ridge', 1e-5, 10) + 'alpha': hp.loguniform('alpha_ridge', np.log(1e-5), np.log(10)) } random_forest_regressor_params = { @@ -111,13 +125,21 @@ def update(self, options): 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), 'max_depth': hp.choice('max_depth', range(5, 10)), 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate', 0, 1), + 'learning_rate': hp.loguniform('learning_rate', low=np.log(1e-5), high=np.log(1)), } adaboost_regressor_params = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.lognormal('learning_rate_adaboost', 0, 1), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-5), high=np.log(1)), +} + +catboost_regressor_params = { + 'iterations': 2, + 'depth': hp.choice('depth_catboost', range(4, 10)), + 'learning_rate': 1, + 'loss_function': 'RMSE', + 'verbose': True } @@ -141,7 +163,11 @@ def update(self, options): 'adaboost': { 'model': AdaBoostRegressor, 'param': adaboost_regressor_params - } + }, + 'catboost': { + 'model': CatBoostRegressor, + 'param': catboost_regressor_params + } } @@ -165,5 +191,9 @@ def update(self, options): 'adaboost': { 'model': AdaBoostClassifier, 'param': adaboost_classifier_params + }, + 'catboost': { + 'model': 
CatBoostClassifier, + 'param': catboost_classifier_params } } \ No newline at end of file diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 214940b..71034d4 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ -1,3 +1,5 @@ +import collections + from sklearn.base import BaseEstimator from abc import abstractmethod import numpy as np @@ -103,18 +105,42 @@ def save_model(self): def get_skf(self, folds): pass + @staticmethod + def pick_diverse_estimators(trial_list, k): + groups = collections.defaultdict(list) + + for obj in trial_list: + groups[obj['space']['model']].append(obj) + estimator_list = [] + idx, j = 0, 0 + while idx < k: + for grp in groups.values(): + if j < len(grp): + model_params = grp[j]['space'] + est = model_params['model'](**model_params['param']) + estimator_list.append(est) + idx += 1 + j += 1 + return estimator_list + def setup_ensemble(self, trials): - best_trials = sorted(trials.results, key=lambda k: k['loss'], reverse=False) # Filter the unsuccessful hparam spaces i.e. 'loss' == 1 - best_trials = [t for t in best_trials if t['loss'] < 1] + best_trials = [t for t in trials.results if t['loss'] < 1] + best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) + self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) + if self.config.random_ensemble: np.random.shuffle(best_trials) - estimator_list = [] - for i in range(self.config.num_estimators_ensemble): - model_params = best_trials[i]['space'] - est = model_params['model'](**model_params['param']) - estimator_list.append(est) + + if self.config.diverse_ensemble: + estimator_list = self.pick_diverse_estimators(best_trials, self.config.num_estimators_ensemble) + else: + estimator_list = [] + for i in range(self.config.num_estimators_ensemble): + model_params = best_trials[i]['space'] + est = model_params['model'](**model_params['param']) + estimator_list.append(est) if self.config.ensemble_strategy == 'ranked_ensembling': best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 6bc0013..5f3423c 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -125,7 +125,7 @@ def __init__(self, raw_x, data_info, verbose=True): raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) self.X = raw_x - self.update_cat_cardinality() + # self.update_cat_cardinality() if self.verbose: print('DATA_INFO: {}'.format(self.data_info)) @@ -178,6 +178,7 @@ def refresh_col_types(self): self.n_cat = len(self.cat_col) def update_cat_cardinality(self): + # TODO: too slow make it faster if not self.cat_cardinality: self.cat_cardinality = {} for c in self.cat_col: diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 750cbbb..ad067d3 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -280,20 +280,19 @@ def get_box_plot(data, task_id, metric): if __name__ == "__main__": regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] - classification_task_list = [3021] - # classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, - # 9914, 3020, 3524, 3573, 3962] + # classification_task_list = [3021] + classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, + 9914, 3020, 3524, 3573, 3962] ak = 
BenchmarkingAutoKaggle(sess_name='test_perf') import time t1 = time.time() for _ in range(3): ak.run_automation(classification_task_list) - ak.export_results() t2 = time.time() print(t2-t1) # import time # t1 = time.time() - # ak.evaluate(31) + # ak.evaluate(3945) # t2 = time.time() # print(t2-t1) From 1731d77ba7caf783748ffc3a6e1ba3efb148adb1 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Wed, 18 Sep 2019 15:45:04 -0500 Subject: [PATCH 21/31] CV for stacking and proba stacking --- autokaggle/config.py | 5 +- autokaggle/ensemblers.py | 114 +++++++++++++++++++++++++++++++-------- autokaggle/estimators.py | 6 +-- examples/benchmarking.py | 14 ++--- 4 files changed, 104 insertions(+), 35 deletions(-) diff --git a/autokaggle/config.py b/autokaggle/config.py index 9b48062..b1bf225 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -13,8 +13,8 @@ class Config(BaseEstimator): def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, - subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True): + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -37,6 +37,7 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True # self.classification_models = ['knn', 'lgbm', 'random_forest',] self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost', 'catboost'] self.diverse_ensemble = diverse_ensemble + self.stack_probabilities = stack_probabilities def update(self, options): for k, v in options.items(): diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 507fa6e..97b82d6 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -15,11 +15,27 @@ from lightgbm import LGBMClassifier, LGBMRegressor import collections from sklearn.model_selection import RandomizedSearchCV, cross_val_score +import hyperopt +from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK +from autokaggle.config import classification_hspace, regression_hspace + + +lgbm_classifier_params = { + 'n_estimators': hp.choice('n_estimators', [100, 150, 200]), +} + +_classification_hspace = { + 'lgbm': { + 'model': LGBMClassifier, + 'param': lgbm_classifier_params + }, +} class RankedEnsembler: - def __init__(self, estimator_list, ensemble_method='max_voting'): - self.ensemble_method = ensemble_method + def __init__(self, estimator_list, config): + self.config = config + self.ensemble_method = config.ensemble_method self.estimators = estimator_list def fit(self, X, y): @@ -44,44 +60,96 @@ def predict(self, X): class StackingEnsembler: - def __init__(self, estimator_list, objective): + stacking_estimator = None + + def __init__(self, estimator_list, config): + self.config = config self.estimator_list = estimator_list - self.objective = objective - if self.objective == 'regression': - self.stacking_estimator = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - elif self.objective == 'multiclass' or self.objective == 'binary': - self.stacking_estimator = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) + self.objective = config.objective + if self.config.objective == 'regression': + 
self.hparams = hp.choice('regressor', [regression_hspace[m] for m in ['lgbm']]) + self.config.stack_probabilities = False + else: + self.hparams = hp.choice('classifier', [_classification_hspace[m] for m in ['lgbm']]) + + def get_model_predictions(self, X): + if self.config.stack_probabilities: + predictions = np.zeros((len(X), 1)) + for i, est in enumerate(self.estimator_list): + try: + new = est.predict_proba(X)[:, :-1] + predictions = np.hstack([predictions, new]) + except AttributeError: + new = np.reshape(est.predict(X), (-1, 1)) + predictions = np.hstack([predictions, new]) + predictions = predictions[:, 1:] + else: + predictions = np.zeros((len(X), len(self.estimator_list))) + for i, est in enumerate(self.estimator_list): + predictions[:, i] = est.predict(X) + return predictions def fit(self, X, y): for est in self.estimator_list: est.fit(X, y) - predictions = np.zeros((len(X), len(self.estimator_list))) - for i, est in enumerate(self.estimator_list): - predictions[:, i] = est.predict(X) + predictions = self.get_model_predictions(X) + self.stacking_estimator = self.search(predictions, y) self.stacking_estimator.fit(predictions, y) + def search(self, x, y): + score_metric, skf = self.get_skf(self.config.cv_folds) + + def objective_func(args): + clf = args['model'](**args['param']) + try: + eval_score = cross_val_score(clf, x, y, scoring=score_metric, cv=skf).mean() + except ValueError: + eval_score = 0 + if self.config.verbose: + print("Ensembling CV Score:", eval_score) + print("\n=================") + return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} + + trials = Trials() + best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, + max_evals=50) + + opt = space_eval(self.hparams, best) + best_estimator_ = opt['model'](**opt['param']) + if self.config.verbose: + print("The best hyperparameter setting found for stacking:") + print(opt) + return best_estimator_ + def predict(self, X): - predictions = np.zeros((len(X), len(self.estimator_list))) - for i, est in enumerate(self.estimator_list): - predictions[:, i] = est.predict(X) + predictions = self.get_model_predictions(X) return self.stacking_estimator.predict(predictions) + def get_skf(self, folds): + if self.config.objective == 'binary': + score_metric = 'roc_auc' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + elif self.config.objective == 'multiclass': + score_metric = 'f1_weighted' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + elif self.config.objective == 'regression': + score_metric = 'neg_mean_squared_error' + skf = KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + else: + ValueError("Invalid objective") + return score_metric, skf + class EnsembleSelection: indices_ = None weights_ = None - def __init__(self, estimator_list, objective, ensemble_size=25): + def __init__(self, estimator_list, config): self.estimator_list = estimator_list - self.objective = objective + self.config = config + self.objective = config.objective self.indices_, self.weights_ = [], [] - self.ensemble_size = min(len(estimator_list), ensemble_size) + self.ensemble_size = len(estimator_list) if self.objective == 'regression': self.score_metric = 'neg_mean_squared_error' self.skf = KFold(n_splits=3, shuffle=True, random_state=1001) diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py index 71034d4..272ab5f 100644 --- a/autokaggle/estimators.py +++ b/autokaggle/estimators.py @@ 
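get_model_predictions above builds the stacker's meta-feature matrix either from per-estimator class probabilities (dropping one redundant column each) or from plain predictions when an estimator has no predict_proba. A hedged standalone sketch of that logic (the sklearn models are chosen only for illustration):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC  # has no predict_proba

X = np.random.rand(50, 4)
y = np.random.randint(0, 2, 50)
estimators = [LogisticRegression().fit(X, y), LinearSVC().fit(X, y)]

cols = []
for est in estimators:
    try:
        cols.append(est.predict_proba(X)[:, :-1])   # (n, n_classes - 1)
    except AttributeError:
        cols.append(est.predict(X).reshape(-1, 1))  # (n, 1) fallback
meta_features = np.hstack(cols)
print(meta_features.shape)  # (50, 2)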
-143,11 +143,11 @@ def setup_ensemble(self, trials): estimator_list.append(est) if self.config.ensemble_strategy == 'ranked_ensembling': - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + best_estimator_ = RankedEnsembler(estimator_list, config=self.config) elif self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, objective=self.config.objective) + best_estimator_ = StackingEnsembler(estimator_list, config=self.config) else: - best_estimator_ = RankedEnsembler(estimator_list, ensemble_method=self.config.ensemble_method) + best_estimator_ = RankedEnsembler(estimator_list, config=self.config) return best_estimator_ diff --git a/examples/benchmarking.py b/examples/benchmarking.py index ad067d3..50dab23 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -286,13 +286,13 @@ def get_box_plot(data, task_id, metric): ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time - t1 = time.time() - for _ in range(3): - ak.run_automation(classification_task_list) - t2 = time.time() - print(t2-t1) - # import time # t1 = time.time() - # ak.evaluate(3945) + # for _ in range(3): + # ak.run_automation(classification_task_list) # t2 = time.time() # print(t2-t1) + import time + t1 = time.time() + ak.evaluate(31) + t2 = time.time() + print(t2-t1) From e614bfde8c43a16a303df691b9bdd4cf0ac12f4d Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 19 Sep 2019 11:13:12 -0500 Subject: [PATCH 22/31] hparam update --- autokaggle/config.py | 75 +++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/autokaggle/config.py b/autokaggle/config.py index b1bf225..985938a 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -46,26 +46,26 @@ def update(self, options): knn_classifier_params = { - 'n_neighbors': hp.choice('n_neighbors', range(2, 20)), - 'algorithm': hp.choice('algorithm', ['ball_tree', 'kd_tree', 'brute']), - 'leaf_size': hp.choice('leaf_size', range(5, 50)), - 'metric': hp.choice('metric', ["euclidean", "manhattan", "chebyshev", "minkowski"]), - 'p': hp.choice('p', range(1, 4)), + 'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]), + 'weights': hp.choice('weight_knn', ['uniform', 'distance']), + 'metric': hp.choice('metric_knn', ["euclidean", "manhattan", "chebyshev", "minkowski"]), + 'p': hp.choice('p_knn', range(1, 3)), } svc_params = { - 'C': hp.loguniform('C', np.log(1e-5), np.log(1e5)), - 'kernel': hp.choice('kernel', ['rbf', 'poly', 'linear', 'sigmoid']), - 'degree': hp.choice('degree', range(1, 7)), - 'gamma': hp.uniform('gamma', 0.001, 10000), + 'C': hp.loguniform('C_svm', np.log(0.03125), np.log(32768)), + 'kernel': hp.choice('kernel_svm', ['rbf', 'poly', 'sigmoid']), + 'degree': hp.choice('degree_svm', range(2, 6)), + 'gamma': hp.loguniform('gamma_svm', np.log(3e-5), np.log(8)), 'max_iter': 50000, } random_forest_classifier_params = { - 'criterion': hp.choice('criterion', ['entropy', 'gini']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) + 'criterion': hp.choice('criterion_rf', ['entropy', 'gini']), + 'max_features': hp.uniform('max_features_rf', 0, 1.0), + 'n_estimators': hp.choice('n_estimators_rf', [100, 50]), + 'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 20)), + 'min_samples_split': hp.choice('min_samples_split_rf', range(2, 20)), } 
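Besides suffixing every hyperopt label with its model name, these updates keep the learning-rate priors on bounded hp.loguniform ranges rather than the earlier hp.lognormal('learning_rate', 0, 1); the difference matters because exp(N(0, 1)) puts roughly half its mass above 1.0. A small sampling sketch (labels and sample counts are illustrative):

import numpy as np
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

rng = np.random.RandomState(0)
old = [sample(hp.lognormal('lr_old', 0, 1), rng=rng) for _ in range(1000)]
new = [sample(hp.loguniform('lr_new', np.log(1e-2), np.log(2)), rng=rng) for _ in range(1000)]
print(sum(v > 1 for v in old) / 1000)          # ~0.5 of draws exceed 1.0
print(round(min(new), 4), round(max(new), 4))  # stays inside [0.01, 2.0]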
lgbm_classifier_params = { @@ -73,21 +73,21 @@ def update(self, options): 'min_split_gain': 0.1, 'subsample': 0.8, 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate', low=np.log(1e-5), high=np.log(1)), + 'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), + 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), + 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2), high=np.log(2)), } adaboost_classifier_params = { 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-5), high=np.log(1)), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), } catboost_classifier_params = { - 'iterations': hp.choice('catboost_iterations', [5, 10]), + 'iterations': hp.choice('iterations_catboost', [5, 10]), 'depth': hp.choice('depth_catboost', range(4, 11)), 'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), high=np.log(1)), 'loss_function': hp.choice('loss_function_catboost', ['Logloss', 'CrossEntropy']), @@ -97,11 +97,13 @@ def update(self, options): } extra_trees_regressor_params = { - 'n_estimators': hp.choice('n_estimators_extra_trees', range(50, 200)), + 'n_estimators': hp.choice('n_estimators_extra_trees', [50, 100, 200]), 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), - 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 10)), - 'min_impurity_decrease': 0.0 + 'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 20)), + 'min_samples_split': hp.choice('min_samples_split_extra_trees', range(2, 20)), + 'min_impurity_decrease': 0.0, + 'bootstrap': hp.choice('bootstrap_extra_trees', [True, False]), } ridge_params = { @@ -111,10 +113,12 @@ def update(self, options): } random_forest_regressor_params = { - 'criterion': hp.choice('criterion', ['mse', 'friedman_mse', 'mae']), - 'max_features': hp.uniform('max_features', 0, 1.0), - 'n_estimators': hp.choice('rf_n_estimators', range(50, 200)), - 'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)) + 'criterion': hp.choice('criterion_rf', ['mse', 'friedman_mse', 'mae']), + 'max_features': hp.uniform('max_features_rf', 0.1, 1.0), + 'n_estimators': hp.choice('n_estimators_rf', [50, 100, 200]), + 'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 10)), + 'min_samples_split': hp.choice('min_samples_split_rf', range(2, 10)), + 'bootstrap': hp.choice('bootstrap_rf', [True, False]), } lgbm_regressor_params = { @@ -122,17 +126,18 @@ def update(self, options): 'min_split_gain': 0.1, 'subsample': 0.8, 'num_leaves': 80, - 'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.8), - 'min_child_weight': hp.choice('min_child_weight', range(1, 100)), - 'max_depth': hp.choice('max_depth', range(5, 10)), - 'n_estimators': hp.choice('n_estimators', range(50, 200)), - 'learning_rate': 
hp.loguniform('learning_rate', low=np.log(1e-5), high=np.log(1)), + 'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), + 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), + 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), high=np.log(1)), } adaboost_regressor_params = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), - 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-5), high=np.log(1)), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), + # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)), } catboost_regressor_params = { From 951e1d10399580f6f9e0a432c82f791c920468b2 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 19 Sep 2019 12:14:12 -0500 Subject: [PATCH 23/31] Add blind dataset in stacking --- autokaggle/ensemblers.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 97b82d6..68cafa7 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -7,7 +7,7 @@ import json from statistics import mode -from sklearn.model_selection import RandomizedSearchCV +from sklearn.model_selection import RandomizedSearchCV, train_test_split from sklearn.model_selection import StratifiedKFold, KFold from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error from joblib import dump, load @@ -90,11 +90,12 @@ def get_model_predictions(self, X): return predictions def fit(self, X, y): + x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2) for est in self.estimator_list: - est.fit(X, y) - predictions = self.get_model_predictions(X) - self.stacking_estimator = self.search(predictions, y) - self.stacking_estimator.fit(predictions, y) + est.fit(x_train, y_train) + predictions = self.get_model_predictions(x_val) + self.stacking_estimator = self.search(predictions, y_val) + self.stacking_estimator.fit(predictions, y_val) def search(self, x, y): score_metric, skf = self.get_skf(self.config.cv_folds) From 5aed1271bdff01214b030db68a681fc872b5f38b Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 19 Sep 2019 20:45:45 -0500 Subject: [PATCH 24/31] Refactor with AutoPipe --- autokaggle/auto_ml.py | 252 ++++++++++++++++++++++++++++----- autokaggle/config.py | 11 +- autokaggle/estimators.py | 185 ------------------------ autokaggle/preprocessor.py | 37 +---- examples/tabular_regression.py | 2 +- 5 files changed, 231 insertions(+), 256 deletions(-) delete mode 100644 autokaggle/estimators.py diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 9fc82c2..4707212 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -1,4 +1,4 @@ -from sklearn.base import BaseEstimator +from sklearn.base import BaseEstimator, is_classifier from abc import abstractmethod import numpy as np import os @@ -8,15 +8,22 @@ from joblib import dump, load from autokaggle.preprocessor import TabularPreprocessor -from autokaggle.estimators import * from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from lightgbm import LGBMClassifier, LGBMRegressor +from autokaggle.config import Config, classification_hspace, regression_hspace +from 
sklearn.model_selection import StratifiedKFold, KFold +import hyperopt +from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL +from sklearn.model_selection import cross_val_score +from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler +from imblearn.over_sampling import SMOTE, SMOTENC +import collections # TODO: Further clean the design of this file class AutoKaggle(BaseEstimator): - objective = None - model = None - preprocessor = None + pipeline = None + hparams = None def __init__(self, config=None, **kwargs): """ @@ -25,11 +32,8 @@ def __init__(self, config=None, **kwargs): self.is_trained = False self.config = config if config else Config() self.config.update(kwargs) - self.config.objective = self.objective if not self.config.path: self.config.path = rand_temp_folder_generator() - # abs_cwd = os.path.split(os.path.abspath(__file__))[0] - self.preprocessor = TabularPreprocessor(self.config) def fit(self, x, y, time_limit=None, data_info=None): """ @@ -48,34 +52,43 @@ def fit(self, x, y, time_limit=None, data_info=None): NOT be available for re-training. """ self.config.time_limit = time_limit if time_limit else 24 * 60 * 60 + + # Extract or read data info + self.config.data_info = data_info if data_info is not None else self.extract_data_info(x) + + if self.config.verbose: + print('DATA_INFO: {}'.format(self.config.data_info)) + print('#TIME features: {}'.format(sum(self.config.data_info == 'TIME'))) + print('#NUM features: {}'.format(sum(self.config.data_info == 'NUM'))) + print('#CAT features: {}'.format(sum(self.config.data_info == 'CAT'))) if x.shape[1] == 0: raise ValueError("No feature exist!") - while x.shape[0] < 60: - x = np.concatenate([x, x], axis=0) - y = np.concatenate([y, y], axis=0) - - # Fit Model and preprocessor - self.preprocessor.fit(x, y, data_info) - x = self.preprocessor.transform(x) - self.model.fit(x, y) - self.is_trained = True + x, y = self.resample(x, y) - if self.config.verbose: - print("The whole available data is: ") - print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) + if self.config.objective == 'classification': + n_classes = len(set(y)) + self.config.objective = 'binary' if n_classes == 2 else 'multiclass' + + # self.pipeline = AutoPipe(LGBMClassifier, {}, {}, self.config) + prep_space = {'prep': hp.choice('data_source', ['a', 'b'])} + self.pipeline = self.get_best_pipeline(self.search(x, y, prep_space, self.hparams)) + self.pipeline.fit(x, y) + self.is_trained = True - def predict(self, x_test, predict_proba=False): + def predict(self, x_test): """ This function should provide predictions of labels on (test) data. The function predict eventually can return probabilities or continuous values. 
""" - x_test = self.preprocessor.transform(x_test) - if predict_proba: - y = self.model.predict_proba(x_test, ) - else: - y = self.model.predict(x_test, ) + y = self.pipeline.predict(x_test) + if y is None: + raise ValueError("Tabular predictor does not exist") + return y + + def predict_proba(self, x_test): + y = self.pipeline.predict_proba(x_test) if y is None: raise ValueError("Tabular predictor does not exist") return y @@ -94,21 +107,192 @@ def evaluate(self, x_test, y_test): return results def final_fit(self, x_train, y_train): - x_train = self.preprocessor.transform(x_train) - self.model.fit(x_train, y_train) + self.pipeline.fit(x_train, y_train) + def resample(self, x, y): + if self.config.balance_class_dist: + x, y = SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(x, y) + while x.shape[0] < 60: + x = np.concatenate([x, x], axis=0) + y = np.concatenate([y, y], axis=0) + return x, y -class AutoKaggleClassifier(AutoKaggle): - objective = 'classification' + def subsample(self, x, y, sample_percent): + # TODO: Add way to balance the subsample + # Set small sample for hyper-param search + if x.shape[0] > 600: + grid_train_percentage = max(600.0 / x.shape[0], sample_percent) + else: + grid_train_percentage = 1 + grid_n = int(x.shape[0] * grid_train_percentage) + idx = random.sample(list(range(x.shape[0])), grid_n) + grid_train_x, grid_train_y = x[idx, :], y[idx] + return grid_train_x, grid_train_y + + def search(self, x, y, prep_space, model_space): + grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) + score_metric, skf = self.get_skf(self.config.cv_folds) + + def objective_func(params): + model_class = params['estimator']['model'] + m_params = params['estimator']['param'] + p_params = params['prep'] + pipeline = AutoPipe(model_class=model_class, m_params=m_params, p_params=p_params, config=self.config) + try: + eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + status = STATUS_OK + except ValueError: + eval_score = float('-inf') + status = STATUS_FAIL + if self.config.verbose: + print("CV Score:", eval_score) + print("\n=================") + loss = 1 - eval_score if status == STATUS_OK else float('inf') + return {'loss': loss, 'status': status, 'model_class': model_class, 'm_params': m_params, + 'p_params': p_params} + + trials = Trials() + search_space = {'prep': prep_space, 'estimator': model_space} + _ = fmin(objective_func, search_space, algo=hyperopt.rand.suggest, trials=trials, + max_evals=self.config.search_iter) + return trials + + def get_best_pipeline(self, trials): + if self.config.use_ensembling: + best_pipeline = self.setup_ensemble(trials) + else: + opt = trials.best_trial['result'] + best_pipeline = AutoPipe(opt['model_class'], opt['m_params'], opt['p_params'], self.config) + if self.config.verbose: + print("The best hyperparameter setting found:") + print(opt) + return best_pipeline + + @abstractmethod + def get_skf(self, folds): + pass + + def pick_diverse_estimators(self, trial_list, k): + groups = collections.defaultdict(list) + for obj in trial_list: + groups[obj['model_class']].append(obj) + estimator_list = [] + idx, j = 0, 0 + while idx < k: + for grp in groups.values(): + if j < len(grp): + est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'], grp[j]['p_params'], self.config) + estimator_list.append(est) + idx += 1 + j += 1 + return estimator_list + + def setup_ensemble(self, trials): + # Filter the unsuccessful hparam spaces i.e. 
'loss' == float('inf') + best_trials = [t for t in trials.results if t['loss'] != float('inf')] + best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) + + self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) + + if self.config.random_ensemble: + np.random.shuffle(best_trials) + + if self.config.diverse_ensemble: + estimator_list = self.pick_diverse_estimators(best_trials, self.config.num_estimators_ensemble) + else: + estimator_list = [] + for i in range(self.config.num_estimators_ensemble): + est = AutoPipe(best_trials[i]['model_class'], best_trials[i]['m_params'], best_trials[i]['p_params'], + self.config) + estimator_list.append(est) + + if self.config.ensemble_strategy == 'stacking': + best_estimator_ = StackingEnsembler(estimator_list, config=self.config) + else: + best_estimator_ = RankedEnsembler(estimator_list, config=self.config) + return best_estimator_ + + @staticmethod + def extract_data_info(raw_x): + """ + This function extracts the data info automatically based on the type of each feature in raw_x. + + Args: + raw_x: a numpy.ndarray instance containing the training data. + """ + data_info = [] + row_num, col_num = raw_x.shape + for col_idx in range(col_num): + try: + raw_x[:, col_idx].astype(np.float) + data_info.append('NUM') + except: + data_info.append('CAT') + return np.array(data_info) + + +class AutoKaggleClassifier(AutoKaggle): def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) - self.model = Classifier(self.config) + self.config.objective = 'classification' + self.hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) + def get_skf(self, folds): + if self.config.objective == 'binary': + score_metric = 'roc_auc' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + else: + score_metric = 'f1_weighted' + skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + return score_metric, skf -class AutoKaggleRegressor(AutoKaggle): - objective = 'regression' +class AutoKaggleRegressor(AutoKaggle): def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) - self.model = Regressor(self.config) + self.config.objective = 'regression' + self.hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) + + def get_skf(self, folds): + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + + +class AutoPipe(BaseEstimator): + prep = None + model = None + config = None + m_params = None + p_params = None + model_class = None + + def __init__(self, model_class, m_params, p_params, config): + self.config = config + self.m_params = m_params + self.p_params = p_params + self.model_class = model_class + self._estimator_type = 'classifier' if is_classifier(model_class) else 'regressor' + + def fit(self, x, y): + self.prep = TabularPreprocessor(self.config) + self.model = self.model_class(**self.m_params) + x = self.prep.fit_transform(x, y) + self.model.fit(x, y) + + def predict(self, x): + x = self.prep.transform(x) + return self.model.predict(x) + + def predict_proba(self, x): + x = self.prep.transform(x) + try: + return self.model.predict_proba(x) + except AttributeError: + return self.model.predict(x) + + def decision_function(self, x): + x = self.prep.transform(x) + try: + return self.model.decision_function(x) + except AttributeError: + raise AttributeError diff 
--git a/autokaggle/config.py b/autokaggle/config.py index 985938a..d9be2ef 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -11,10 +11,11 @@ import numpy as np -class Config(BaseEstimator): - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, - subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False): +class Config: + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=5, + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, + data_info=None, balance_class_dist=False): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -38,6 +39,8 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost', 'catboost'] self.diverse_ensemble = diverse_ensemble self.stack_probabilities = stack_probabilities + self.data_info = data_info + self.balance_class_dist = balance_class_dist def update(self, options): for k, v in options.items(): diff --git a/autokaggle/estimators.py b/autokaggle/estimators.py deleted file mode 100644 index 272ab5f..0000000 --- a/autokaggle/estimators.py +++ /dev/null @@ -1,185 +0,0 @@ -import collections - -from sklearn.base import BaseEstimator -from abc import abstractmethod -import numpy as np -import os -import random -import json - -from sklearn.model_selection import RandomizedSearchCV, cross_val_score -from sklearn.model_selection import StratifiedKFold, KFold - -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error, make_scorer -from joblib import dump, load - -from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json -from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler -from autokaggle.config import Config, classification_hspace, regression_hspace -import hyperopt -from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK -from imblearn.over_sampling import SMOTE, SMOTENC - - -class TabularEstimator(BaseEstimator): - def __init__(self, config=None, **kwargs): - """ - Initialization function for tabular supervised learner. 
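[Editor's note, not part of the patch] As a usage note for the Config changes above: AutoKaggle's constructor forwards extra keyword arguments to Config.update(), so callers can override the defaults shown in the diff. A sketch under that assumption (option names are taken from the Config constructor signature; what update() does with each key is assumed to be attribute assignment):

    from autokaggle import AutoKaggleClassifier
    from autokaggle.config import Config

    # Either build a Config explicitly ...
    cfg = Config(use_ensembling=True, ensemble_strategy='stacking', cv_folds=5)
    clf = AutoKaggleClassifier(config=cfg)

    # ... or pass keyword overrides, which reach Config through update().
    clf = AutoKaggleClassifier(num_estimators_ensemble=10, subsample_ratio=0.2)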
- """ - self.config = config if config else Config() - self.best_estimator_ = None - self.hparams = None - - def fit(self, x, y): - if self.config.objective == 'classification': - n_classes = len(set(y)) - self.config.objective = 'binary' if n_classes == 2 else 'multiclass' - # x, y = self.resample(x, y) - self.best_estimator_, _ = self.search(x, y) - self.best_estimator_.fit(x, y) - self.save_model() - - def predict(self, x, y=None): - y_pred = self.best_estimator_.predict(x, ) - return y_pred - - def predict_proba(self, x, y=None): - try: - y_pred = self.best_estimator_.predict_proba(x, ) - except: - y_pred = self.best_estimator_.predict(x, ) - return y_pred - - def resample(self, X, y): - return SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(X, y) - - @staticmethod - def subsample(x, y, sample_percent): - # TODO: Add way to balance the subsample - # Set small sample for hyper-param search - if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], sample_percent) - else: - grid_train_percentage = 1 - grid_n = int(x.shape[0] * grid_train_percentage) - idx = random.sample(list(range(x.shape[0])), grid_n) - grid_train_x, grid_train_y = x[idx, :], y[idx] - return grid_train_x, grid_train_y - - def search(self, x, y): - grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) - score_metric, skf = self.get_skf(self.config.cv_folds) - - def objective_func(params): - args = params['estimator'] - clf = args['model'](**args['param']) - try: - eval_score = cross_val_score(clf, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() - except ValueError: - eval_score = 0 - if self.config.verbose: - print("CV Score:", eval_score) - print("\n=================") - return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} - - trials = Trials() - search_space = {'source': hp.choice('data_source', ['a', 'b']), 'estimator': self.hparams} - best = fmin(objective_func, search_space, algo=hyperopt.rand.suggest, trials=trials, - max_evals=self.config.search_iter) - - if self.config.use_ensembling: - best_estimator_ = self.setup_ensemble(trials) - else: - opt = space_eval(self.hparams, best) - best_estimator_ = opt['model'](**opt['param']) - if self.config.verbose: - print("The best hyperparameter setting found:") - print(opt) - return best_estimator_, trials - - @abstractmethod - def save_model(self): - pass - - @abstractmethod - def get_skf(self, folds): - pass - - @staticmethod - def pick_diverse_estimators(trial_list, k): - groups = collections.defaultdict(list) - - for obj in trial_list: - groups[obj['space']['model']].append(obj) - estimator_list = [] - idx, j = 0, 0 - while idx < k: - for grp in groups.values(): - if j < len(grp): - model_params = grp[j]['space'] - est = model_params['model'](**model_params['param']) - estimator_list.append(est) - idx += 1 - j += 1 - return estimator_list - - def setup_ensemble(self, trials): - # Filter the unsuccessful hparam spaces i.e. 
'loss' == 1 - best_trials = [t for t in trials.results if t['loss'] < 1] - best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) - - self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) - - if self.config.random_ensemble: - np.random.shuffle(best_trials) - - if self.config.diverse_ensemble: - estimator_list = self.pick_diverse_estimators(best_trials, self.config.num_estimators_ensemble) - else: - estimator_list = [] - for i in range(self.config.num_estimators_ensemble): - model_params = best_trials[i]['space'] - est = model_params['model'](**model_params['param']) - estimator_list.append(est) - - if self.config.ensemble_strategy == 'ranked_ensembling': - best_estimator_ = RankedEnsembler(estimator_list, config=self.config) - elif self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, config=self.config) - else: - best_estimator_ = RankedEnsembler(estimator_list, config=self.config) - return best_estimator_ - - -class Classifier(TabularEstimator): - """Classifier class. - It is used for tabular data classification. - """ - def __init__(self, config=None, **kwargs): - super().__init__(config, **kwargs) - self.config.objective = 'classification' - # TODO: add choice to the set of estimators - self.hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) - - def get_skf(self, folds): - if self.config.objective == 'binary': - score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) - else: - score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) - return score_metric, skf - - -class Regressor(TabularEstimator): - """Regressor class. - It is used for tabular data regression. - """ - def __init__(self, config=None, **kwargs): - super().__init__(config, **kwargs) - self.config.objective = 'regression' - # TODO: add choice to the set of estimators - self.hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) - - def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 5f3423c..24781f2 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -16,7 +16,7 @@ LEVEL_HIGH = 32 -class TabularPreprocessor: +class TabularPreprocessor(TransformerMixin): def __init__(self, config): """ Initialization function for tabular preprocessor. @@ -25,7 +25,7 @@ def __init__(self, config): self.pipeline = None self.config = config - def fit(self, raw_x, y, data_info): + def fit(self, raw_x, y): """ This function should train the model parameters. @@ -38,16 +38,13 @@ def fit(self, raw_x, y, data_info): 'TIME' for temporal feature, 'NUM' for other numerical feature, and 'CAT' for categorical feature. 
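[Editor's note, not part of the patch] To make the data_info convention described above concrete, here is a small standalone sketch; it mirrors the float-cast heuristic used by extract_data_info() earlier in this patch, under an invented name and with an invented sample array.

    import numpy as np

    def guess_data_info(raw_x):
        # Columns that cast to float are tagged 'NUM'; the rest are 'CAT'.
        # Temporal columns cannot be guessed this way and must be passed as 'TIME' explicitly.
        info = []
        for col_idx in range(raw_x.shape[1]):
            try:
                raw_x[:, col_idx].astype(np.float64)
                info.append('NUM')
            except ValueError:
                info.append('CAT')
        return np.array(info)

    x = np.array([[1.5, 'red'], [2.0, 'blue']], dtype=object)
    print(guess_data_info(x))   # -> ['NUM' 'CAT']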
""" - # Extract or read data info - self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - - data = TabularData(raw_x, self.data_info, self.config.verbose) + data = TabularData(raw_x, self.config.data_info, self.config.verbose) self.pipeline = Pipeline([ ('imputer', Imputation(selected_type='ALL', operation='upd')), # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), # ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), - ('target_encoder', TargetEncoder(selected_type='CAT', operation='add')), + ('target_encoder', TargetEncoder(selected_type='CAT', operation='upd')), # ('count_encoder', CatCount(selected_type='CAT', operation='upd')), # ('one_hot_encoder', OneHot(selected_type='CAT', operation='upd')), # ('label_encoder', LabelEncode(selected_type='CAT', operation='upd')), @@ -78,28 +75,10 @@ def transform(self, raw_x): NOT be available for re-training. """ # Get Meta-Feature - data = TabularData(raw_x, self.data_info, self.config.verbose) + data = TabularData(raw_x, self.config.data_info, self.config.verbose) a = self.pipeline.transform(data).X return a.values - @staticmethod - def extract_data_info(raw_x): - """ - This function extracts the data info automatically based on the type of each feature in raw_x. - - Args: - raw_x: a numpy.ndarray instance containing the training data. - """ - data_info = [] - row_num, col_num = raw_x.shape - for col_idx in range(col_num): - try: - raw_x[:, col_idx].astype(np.float) - data_info.append('NUM') - except: - data_info.append('CAT') - return np.array(data_info) - class TabularData: cat_col = None @@ -127,12 +106,6 @@ def __init__(self, raw_x, data_info, verbose=True): self.X = raw_x # self.update_cat_cardinality() - if self.verbose: - print('DATA_INFO: {}'.format(self.data_info)) - print('#TIME features: {}'.format(self.n_time)) - print('#NUM features: {}'.format(self.n_num)) - print('#CAT features: {}'.format(self.n_cat)) - def update_type(self, columns, new_type): for c in columns: self.data_info[c] = new_type diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 885160c..ec5af7a 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggleRegressor('regression') + clf = AutoKaggleRegressor() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 4749d5cd03cee5fa1404b5b953e43eecb4c5fb4a Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 19 Sep 2019 21:53:51 -0500 Subject: [PATCH 25/31] Fix higher order primitives --- autokaggle/__init__.py | 1 - autokaggle/preprocessor.py | 130 +++++++++++++++++++++++-------------- 2 files changed, 83 insertions(+), 48 deletions(-) diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index 1a975c9..727b31e 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,3 +1,2 @@ from autokaggle.auto_ml import AutoKaggleClassifier, AutoKaggleRegressor -from autokaggle.estimators import * from autokaggle.ensemblers import * diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 24781f2..ebb3f43 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -12,17 +12,17 @@ from abc import abstractmethod import collections from lightgbm import LGBMClassifier, LGBMRegressor -from autokaggle.estimators import Config LEVEL_HIGH = 
32 class TabularPreprocessor(TransformerMixin): + pipeline = None + data_info = None + def __init__(self, config): """ Initialization function for tabular preprocessor. """ - self.data_info = None - self.pipeline = None self.config = config def fit(self, raw_x, y): @@ -41,23 +41,23 @@ def fit(self, raw_x, y): data = TabularData(raw_x, self.config.data_info, self.config.verbose) self.pipeline = Pipeline([ - ('imputer', Imputation(selected_type='ALL', operation='upd')), - # ('cat_num_encoder', CatNumEncoder(selected_type1='CAT', selected_type2='NUM')), - # ('cat_num_encoder', CatCatEncoder(selected_type1='CAT', selected_type2='CAT')), - ('target_encoder', TargetEncoder(selected_type='CAT', operation='upd')), - # ('count_encoder', CatCount(selected_type='CAT', operation='upd')), - # ('one_hot_encoder', OneHot(selected_type='CAT', operation='upd')), - # ('label_encoder', LabelEncode(selected_type='CAT', operation='upd')), - ('scaler', TabScaler(selected_type='NUM', operation='upd')), - # ('boxcox', BoxCox(selected_type='NUM', operation='upd')), - # ('log_transform', LogTransform(selected_type='NUM', operation='upd')), - # ('binning', Binning(selected_type='NUM', operation='upd')), - # ('pca', TabPCA(selected_type='NUM', operation='add')), - # ('time_diff', TimeDiff(selected_type='TIME', operation='add')), - # ('time_offset', TimeOffset(selected_type='TIME', operation='upd')), - ('filter', FilterConstant(selected_type='ALL', operation='del')), - # ('pearson_corr', FeatureFilter(selected_type='ALL', operation='del')), - # ('lgbm_feat_selection', FeatureImportance(selected_type='ALL', operation='del')), + ('imputer', Imputation(operation='upd', selected_type='ALL')), + # ('cat_num_encoder', CatNumEncoder(operation='add', selected_type1='CAT', selected_type2='NUM')), + # ('cat_cat_encoder', CatCatEncoder(operation='add', selected_type1='CAT', selected_type2='CAT')), + ('target_encoder', TargetEncoder(operation='upd', selected_type='CAT')), + # ('count_encoder', CatCount(operation='upd', selected_type='CAT')), + # ('one_hot_encoder', OneHot(operation='upd', selected_type='CAT')), + # ('label_encoder', LabelEncode(operation='upd', selected_type='CAT')), + ('scaler', TabScaler(operation='upd', selected_type='NUM')), + # ('boxcox', BoxCox(operation='upd', selected_type='NUM')), + # ('log_transform', LogTransform(operation='upd', selected_type='NUM')), + # ('binning', Binning(operation='upd', selected_type='NUM')), + # ('pca', TabPCA(operation='add', selected_type='NUM')), + # ('time_diff', TimeDiff(operation='add', selected_type='TIME')), + # ('time_offset', TimeOffset(operation='upd', selected_type='TIME')), + ('filter', FilterConstant(operation='del', selected_type='ALL')), + # ('pearson_corr', FeatureFilter(operation='del', selected_type='ALL')), + # ('lgbm_feat_selection', FeatureImportance(operation='del', selected_type='ALL')), ]) self.pipeline.fit(data, y) @@ -179,7 +179,7 @@ class Primitive(BaseEstimator, TransformerMixin): supported_ops = ('add', 'upd', 'del') name_key = '' - def __init__(self, selected_type=None, operation='upd', **kwargs): + def __init__(self, operation='upd', selected_type=None, **kwargs): self.selected_type = selected_type self.operation = operation self.init_vars(**kwargs) @@ -211,10 +211,48 @@ def _transform(self, data, y=None): pass -class PrimitiveHigherOrder(Primitive): - def __init__(self, operation='upd', selected_type=None, selected_type2=None, **kwargs): - super().__init__(selected_type, operation, **kwargs) +class PrimitiveHigherOrder: + selected_1 = None + 
selected_2 = None + drop_columns = None + options = None + supported_ops = ('add', 'upd', 'del') + name_key = '' + + def __init__(self, operation='upd', selected_type1=None, selected_type2=None, **kwargs): + self.operation = operation + self.selected_type1 = selected_type1 self.selected_type2 = selected_type2 + self.init_vars(**kwargs) + self.name_key = self.__class__.__name__ + + def init_vars(self, **kwargs): + self.options = kwargs + + def fit(self, data, y=None): + self.selected_1 = data.select_columns(self.selected_type1) + self.selected_2 = data.select_columns(self.selected_type2) + + if self.operation not in self.supported_ops: + print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) + self.selected_1 = None + self.selected_2 = None + if not self.selected_1 or not self.selected_2: + return self + return self._fit(data, y) + + def transform(self, data, y=None): + if not self.selected_1 or not self.selected_2: + return data + return self._transform(data, y) + + @abstractmethod + def _fit(self, data, y=None): + pass + + @abstractmethod + def _transform(self, data, y=None): + pass class TabScaler(Primitive): @@ -345,7 +383,9 @@ def _transform(self, data, y=None): class CatCatEncoder(PrimitiveHigherOrder): - supported_ops = ('add', 'upd') + supported_ops = ('add', ) + cat_cat_map = None + @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): if strategy == 'count': @@ -359,28 +399,23 @@ def cat_cat_count(df, col1, col2, strategy='count'): def _fit(self, data, y=None): self.cat_cat_map = {} self.strategy = self.options.get('strategy', 'count') - for col1, col2 in itertools.combinations(self.selected, 2): + self.selected_1 = list(set(self.selected_1 + self.selected_2)) + for col1, col2 in itertools.combinations(self.selected_1, 2): self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() - for col1, col2 in itertools.combinations(self.selected, 2): + for col1, col2 in itertools.combinations(self.selected_1, 2): if col1 + '_cross_' + col2 in self.cat_cat_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) - # TODO: decide new_type - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) return data class CatNumEncoder(PrimitiveHigherOrder): - supported_ops = ('add', 'upd') - - def __init__(self, selected_type=None, selected_num=[], operation='add', strategy='mean'): - super().__init__(selected_type, operation) - self.selected_num = selected_num - self.strategy = strategy - self.cat_num_map = {} + supported_ops = ('add', ) + cat_num_map = None @staticmethod def cat_num_interaction(df, col1, col2, method='mean'): @@ -400,23 +435,24 @@ def cat_num_interaction(df, col1, col2, method='mean'): def _fit(self, data, y=None): self.cat_num_map = {} self.strategy = self.options.get('strategy', 'mean') - for col1 in self.selected: - for col2 in self.selected_num: + for col1 in self.selected_1: + for col2 in self.selected_2: self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() - for col1 in self.selected: - for col2 in self.selected_num: + for col1 in self.selected_1: + for col2 in self.selected_2: if col1 + '_cross_' + col2 in self.cat_num_map: x_tr[col1 + '_cross_' + 
col2] = data.X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) return data class CatBinEncoder(PrimitiveHigherOrder): - supported_ops = ('add', 'upd') + supported_ops = ('add', ) + cat_bin_map = None @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): @@ -431,18 +467,18 @@ def cat_bin_interaction(df, col1, col2, strategy='percent_true'): def _fit(self, data, y=None): self.cat_bin_map = {} self.strategy = self.options.get('strategy', 'percent_true') - for col1 in self.selected: - for col2 in self.selected_bin: + for col1 in self.selected_1: + for col2 in self.selected_2: self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() - for col1 in self.selected: - for col2 in self.selected_bin: + for col1 in self.selected_1: + for col2 in self.selected_2: if col1 + '_cross_' + col2 in self.cat_bin_map: x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) return data From f4df7fd24f7ea1abe37d4288743a48d5531efdd2 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Fri, 20 Sep 2019 01:22:25 -0500 Subject: [PATCH 26/31] Add params to preprocessor --- autokaggle/auto_ml.py | 2 +- autokaggle/config.py | 4 +- autokaggle/preprocessor.py | 153 +++++++++++++++++++++++++++++++------ 3 files changed, 131 insertions(+), 28 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 4707212..c7c021a 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -274,7 +274,7 @@ def __init__(self, model_class, m_params, p_params, config): self._estimator_type = 'classifier' if is_classifier(model_class) else 'regressor' def fit(self, x, y): - self.prep = TabularPreprocessor(self.config) + self.prep = TabularPreprocessor(self.config, self.p_params) self.model = self.model_class(**self.m_params) x = self.prep.fit_transform(x, y) self.model.fit(x, y) diff --git a/autokaggle/config.py b/autokaggle/config.py index d9be2ef..4e780ff 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -12,8 +12,8 @@ class Config: - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=5, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, data_info=None, balance_class_dist=False): self.verbose = verbose diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index ebb3f43..258374a 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -18,12 +18,15 @@ class TabularPreprocessor(TransformerMixin): pipeline = None data_info = None + params = None + config = None - def __init__(self, config): + def __init__(self, config, params): """ Initialization function for tabular preprocessor. 
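[Editor's note, not part of the patch] A plain-pandas illustration of the categorical-numerical interaction that CatNumEncoder builds above: each categorical column is mapped to an aggregate (for example the mean) of a numerical column, yielding a new '<cat>_cross_<num>' feature. The column names below are invented.

    import pandas as pd

    df = pd.DataFrame({'city': ['a', 'a', 'b', 'b'], 'price': [10, 20, 30, 50]})
    cat_num_map = df.groupby('city')['price'].mean()      # what cat_num_interaction computes
    df['city_cross_price'] = df['city'].map(cat_num_map)  # what _transform applies
    print(df)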
""" self.config = config + self.params = params def fit(self, raw_x, y): """ @@ -40,25 +43,15 @@ def fit(self, raw_x, y): """ data = TabularData(raw_x, self.config.data_info, self.config.verbose) - self.pipeline = Pipeline([ - ('imputer', Imputation(operation='upd', selected_type='ALL')), - # ('cat_num_encoder', CatNumEncoder(operation='add', selected_type1='CAT', selected_type2='NUM')), - # ('cat_cat_encoder', CatCatEncoder(operation='add', selected_type1='CAT', selected_type2='CAT')), - ('target_encoder', TargetEncoder(operation='upd', selected_type='CAT')), - # ('count_encoder', CatCount(operation='upd', selected_type='CAT')), - # ('one_hot_encoder', OneHot(operation='upd', selected_type='CAT')), - # ('label_encoder', LabelEncode(operation='upd', selected_type='CAT')), - ('scaler', TabScaler(operation='upd', selected_type='NUM')), - # ('boxcox', BoxCox(operation='upd', selected_type='NUM')), - # ('log_transform', LogTransform(operation='upd', selected_type='NUM')), - # ('binning', Binning(operation='upd', selected_type='NUM')), - # ('pca', TabPCA(operation='add', selected_type='NUM')), - # ('time_diff', TimeDiff(operation='add', selected_type='TIME')), - # ('time_offset', TimeOffset(operation='upd', selected_type='TIME')), - ('filter', FilterConstant(operation='del', selected_type='ALL')), - # ('pearson_corr', FeatureFilter(operation='del', selected_type='ALL')), - # ('lgbm_feat_selection', FeatureImportance(operation='del', selected_type='ALL')), - ]) + steps = [] + steps.extend(self.get_imputation_pipeline(self.params)) + steps.extend(self.get_higher_order_pipeline(self.params)) + steps.extend(self.get_categorical_pipeline(self.params)) + steps.extend(self.get_numerical_pipeline(self.params)) + steps.extend(self.get_time_pipeline(self.params)) + steps.extend(self.get_filtering_pipeline(self.params)) + self.pipeline = Pipeline(steps) + self.pipeline.fit(data, y) return self @@ -79,6 +72,94 @@ def transform(self, raw_x): a = self.pipeline.transform(data).X return a.values + @staticmethod + def get_categorical_pipeline(params): + choice = params.get('cat_encoding', 'target') + cat_pipeline = [] + if choice == 'target': + cat_pipeline.append(('target_encoder', TargetEncoder(operation='upd', selected_type='CAT'))) + elif choice == 'label': + cat_pipeline.append(('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) + elif choice == 'count': + cat_pipeline.append(('count_encoder', CatCount(operation='upd', selected_type='CAT'))) + elif choice == 'target+count': + cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', selected_type='CAT'))) + cat_pipeline.append(('count_encoder', CatCount(operation='upd', selected_type='CAT'))) + elif choice == 'one_hot': + cat_pipeline.append(('one_hot_encoder', OneHot(operation='upd', selected_type='CAT'))) + elif choice == 'target+label': + cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', selected_type='CAT'))) + cat_pipeline.append(('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) + else: + raise ValueError + return cat_pipeline + + @staticmethod + def get_numerical_pipeline(params): + scaling = params.get('scaling', True) + log_transform = params.get('log_transform', False) + power_transform = params.get('power_transform', False) + pca = params.get('pca', False) + binning = params.get('binning', False) + + numeric_pipeline = [] + if scaling: + numeric_pipeline.append(('scaler', TabScaler(operation='upd', selected_type='NUM'))) + if log_transform: + 
numeric_pipeline.append(('log_transform', LogTransform(operation='upd', selected_type='NUM'))) + if power_transform: + numeric_pipeline.append(('boxcox', BoxCox(operation='upd', selected_type='NUM'))) + if pca: + numeric_pipeline.append(('pca', TabPCA(operation='add', selected_type='NUM'))) + if binning: + numeric_pipeline.append(('binning', Binning(operation='add', selected_type='NUM'))) + return numeric_pipeline + + def get_filtering_pipeline(self, params): + pearson_thresh = params.get('pearson_thresh', 0) + feat_importance_thresh = params.get('feat_importance_thresh', 0) + + filter_pipeline = [('filter', FilterConstant(operation='del', selected_type='ALL'))] + if pearson_thresh > 0: + filter_pipeline.append(('pearson_corr', FeatureFilter(operation='del', selected_type='ALL', + threshold=pearson_thresh))) + if feat_importance_thresh > 0: + filter_pipeline.append(('lgbm_feat_selection', FeatureImportance(operation='del', + selected_type='ALL', + threshold=feat_importance_thresh, + task_type=self.config.objective))) + return filter_pipeline + + @staticmethod + def get_time_pipeline(params): + add_offset = params.get('add_time_offset', False) + add_diff = params.get('add_time_diff', False) + time_pipeline = [] + if add_offset: + time_pipeline.append(('time_offset', TimeOffset(operation='upd', selected_type='TIME'))) + if add_diff: + time_pipeline.append(('time_diff', TimeDiff(operation='add', selected_type='TIME'))) + return time_pipeline + + @staticmethod + def get_imputation_pipeline(params): + strategy = params.get('imputation_strategy', 'most_frequent') + impute_pipeline = [('imputer', Imputation(operation='upd', selected_type='ALL', strategy=strategy))] + return impute_pipeline + + @staticmethod + def get_higher_order_pipeline(params): + cat_num_strategy = params.get('cat_num_strategy', None) + cat_cat_strategy = params.get('cat_cat_strategy', None) + pipeline = [] + if cat_num_strategy: + pipeline.append(('cat_num_encoder', CatNumEncoder(operation='add', selected_type1='CAT', + selected_type2='NUM', strategy=cat_num_strategy))) + if cat_cat_strategy: + pipeline.append(('cat_cat_encoder', CatCatEncoder(operation='add', selected_type1='CAT', + selected_type2='CAT', strategy=cat_cat_strategy))) + return pipeline + class TabularData: cat_col = None @@ -385,6 +466,10 @@ def _transform(self, data, y=None): class CatCatEncoder(PrimitiveHigherOrder): supported_ops = ('add', ) cat_cat_map = None + strategy = None + + def init_vars(self, strategy='count'): + self.strategy = strategy @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): @@ -398,7 +483,6 @@ def cat_cat_count(df, col1, col2, strategy='count'): def _fit(self, data, y=None): self.cat_cat_map = {} - self.strategy = self.options.get('strategy', 'count') self.selected_1 = list(set(self.selected_1 + self.selected_2)) for col1, col2 in itertools.combinations(self.selected_1, 2): self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(data.X, col1, col2, self.strategy) @@ -416,6 +500,10 @@ def _transform(self, data, y=None): class CatNumEncoder(PrimitiveHigherOrder): supported_ops = ('add', ) cat_num_map = None + strategy = None + + def init_vars(self, strategy='mean'): + self.strategy = strategy @staticmethod def cat_num_interaction(df, col1, col2, method='mean'): @@ -434,7 +522,6 @@ def cat_num_interaction(df, col1, col2, method='mean'): def _fit(self, data, y=None): self.cat_num_map = {} - self.strategy = self.options.get('strategy', 'mean') for col1 in self.selected_1: for col2 in self.selected_2: 
self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(data.X, col1, col2, self.strategy) @@ -453,6 +540,10 @@ def _transform(self, data, y=None): class CatBinEncoder(PrimitiveHigherOrder): supported_ops = ('add', ) cat_bin_map = None + strategy = None + + def init_vars(self, strategy='percent_true'): + self.strategy = strategy @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): @@ -466,7 +557,6 @@ def cat_bin_interaction(df, col1, col2, strategy='percent_true'): def _fit(self, data, y=None): self.cat_bin_map = {} - self.strategy = self.options.get('strategy', 'percent_true') for col1 in self.selected_1: for col2 in self.selected_2: self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(data.X, col1, col2, self.strategy) @@ -578,12 +668,21 @@ def _transform(self, data, y=None): class Imputation(Primitive): impute_dict = None supported_ops = ('add', 'upd') + strategy = None + + def init_vars(self, strategy='most_frequent'): + self.strategy = strategy def _fit(self, data, y=None): self.impute_dict = {} for col in self.selected: - value_counts = data.X[col].value_counts() - self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 + if self.strategy == 'most_frequent': + value_counts = data.X[col].value_counts() + self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 + elif self.strategy == 'zero': + self.impute_dict[col] = 0 + else: + raise ValueError return self def _transform(self, data, y=None): @@ -599,6 +698,8 @@ class FeatureFilter(Primitive): supported_ops = ('del',) def init_vars(self, threshold=0.001): + if threshold == 0: + self.selected = None self.threshold = threshold self.drop_columns = [] @@ -622,6 +723,8 @@ class FeatureImportance(Primitive): supported_ops = ('del',) def init_vars(self, threshold=0.001, task_type='classification'): + if threshold == 0: + self.selected = None self.threshold = threshold self.drop_columns = [] self.task_type = task_type From d5b7f9ca0345eb161c217247355dc00fa06bfde8 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Fri, 20 Sep 2019 06:52:38 -0500 Subject: [PATCH 27/31] 2 rounds search --- autokaggle/auto_ml.py | 31 ++++++++++++++---- autokaggle/config.py | 71 ++++++++++++++++++++++++++++++++++++++-- autokaggle/ensemblers.py | 4 +-- 3 files changed, 94 insertions(+), 12 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index c7c021a..6c61da3 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -10,7 +10,8 @@ from autokaggle.preprocessor import TabularPreprocessor from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from lightgbm import LGBMClassifier, LGBMRegressor -from autokaggle.config import Config, classification_hspace, regression_hspace +from autokaggle.config import Config, classification_hspace, regression_hspace, classification_hspace_base,\ + regression_hspace_base, regression_p_hspace_base, classification_p_hspace_base from sklearn.model_selection import StratifiedKFold, KFold import hyperopt from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL @@ -23,7 +24,9 @@ # TODO: Further clean the design of this file class AutoKaggle(BaseEstimator): pipeline = None - hparams = None + m_hparams = None + m_hparams_base = None + p_hparams_base = None def __init__(self, config=None, **kwargs): """ @@ -72,8 +75,13 @@ def fit(self, x, y, time_limit=None, data_info=None): self.config.objective = 'binary' if n_classes == 2 else 'multiclass' # self.pipeline = 
AutoPipe(LGBMClassifier, {}, {}, self.config) - prep_space = {'prep': hp.choice('data_source', ['a', 'b'])} - self.pipeline = self.get_best_pipeline(self.search(x, y, prep_space, self.hparams)) + # Search the top preprocessing setting + trials = self.search(x, y, self.p_hparams_base, self.m_hparams_base) + p_hparams = self.get_top_prep(trials) + # Search the best pipelines + trials = self.search(x, y, p_hparams, self.m_hparams_base) + self.pipeline = self.get_best_pipeline(trials) + # Fit data self.pipeline.fit(x, y) self.is_trained = True @@ -153,7 +161,7 @@ def objective_func(params): trials = Trials() search_space = {'prep': prep_space, 'estimator': model_space} - _ = fmin(objective_func, search_space, algo=hyperopt.rand.suggest, trials=trials, + _ = fmin(objective_func, search_space, algo=self.config.ensembling_algo, trials=trials, max_evals=self.config.search_iter) return trials @@ -168,6 +176,9 @@ def get_best_pipeline(self, trials): print(opt) return best_pipeline + def get_top_prep(self, trials): + return hp.choice('p_params', [res['p_params'] for res in trials.results]) + @abstractmethod def get_skf(self, folds): pass @@ -236,7 +247,10 @@ class AutoKaggleClassifier(AutoKaggle): def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) self.config.objective = 'classification' - self.hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) + self.m_hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) + self.m_hparams_base = hp.choice('classifier', + [classification_hspace_base[m] for m in self.config.classification_models]) + self.p_hparams_base = classification_p_hspace_base def get_skf(self, folds): if self.config.objective == 'binary': @@ -252,7 +266,10 @@ class AutoKaggleRegressor(AutoKaggle): def __init__(self, config=None, **kwargs): super().__init__(config, **kwargs) self.config.objective = 'regression' - self.hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) + self.m_hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) + self.m_hparams_base = hp.choice('regressor', + [regression_hspace_base[m] for m in self.config.classification_models]) + self.p_hparams_base = regression_p_hspace_base def get_skf(self, folds): return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) diff --git a/autokaggle/config.py b/autokaggle/config.py index 4e780ff..8855251 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -1,5 +1,6 @@ from sklearn.base import BaseEstimator from autokaggle.utils import rand_temp_folder_generator, ensure_dir +import hyperopt from hyperopt import hp from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier @@ -12,10 +13,10 @@ class Config: - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=50, + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, - data_info=None, balance_class_dist=False): + data_info=None, balance_class_dist=False, ensembling_search_iter=10, ensembling_algo='random'): self.verbose 
= verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -41,6 +42,8 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.stack_probabilities = stack_probabilities self.data_info = data_info self.balance_class_dist = balance_class_dist + self.ensembling_search_iter = ensembling_search_iter + self.ensembling_algo = hyperopt.rand.suggest if ensembling_algo == 'random' else hyperopt.tpe.suggest def update(self, options): for k, v in options.items(): @@ -205,4 +208,66 @@ def update(self, options): 'model': CatBoostClassifier, 'param': catboost_classifier_params } +} + +classification_hspace_base = { + 'knn': { + 'model': KNeighborsClassifier, + 'param': {} + }, + 'svm': { + 'model': SVC, + 'param': {} + }, + 'random_forest': { + 'model': RandomForestClassifier, + 'param': {} + }, + 'lgbm': { + 'model': LGBMClassifier, + 'param': {} + }, + 'adaboost': { + 'model': AdaBoostClassifier, + 'param': {} + }, + 'catboost': { + 'model': CatBoostClassifier, + 'param': {} + } +} + +regression_hspace_base = { + 'extratree': { + 'model': ExtraTreesRegressor, + 'param': {} + }, + 'ridge': { + 'model': Ridge, + 'param': {} + }, + 'random_forest': { + 'model': RandomForestRegressor, + 'param': {} + }, + 'lgbm': { + 'model': LGBMRegressor, + 'param': {} + }, + 'adaboost': { + 'model': AdaBoostRegressor, + 'param': {} + }, + 'catboost': { + 'model': CatBoostRegressor, + 'param': {} + } +} + +regression_p_hspace_base = { + +} + +classification_p_hspace_base = { + } \ No newline at end of file diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 68cafa7..30e411f 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -112,8 +112,8 @@ def objective_func(args): return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=hyperopt.rand.suggest, trials=trials, - max_evals=50) + best = fmin(objective_func, self.hparams, algo=self.config.ensembling_algo, trials=trials, + max_evals=self.config.ensembling_search_iter) opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) From 6d25537d4f0cc50f2013017d4f4f933730ac0eff Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Fri, 20 Sep 2019 09:12:49 -0500 Subject: [PATCH 28/31] Select best preprocessing settings --- autokaggle/auto_ml.py | 17 ++++++++++++++--- autokaggle/config.py | 8 +++++--- examples/benchmarking.py | 3 +-- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 6c61da3..978682b 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -77,7 +77,7 @@ def fit(self, x, y, time_limit=None, data_info=None): # self.pipeline = AutoPipe(LGBMClassifier, {}, {}, self.config) # Search the top preprocessing setting trials = self.search(x, y, self.p_hparams_base, self.m_hparams_base) - p_hparams = self.get_top_prep(trials) + p_hparams = self.get_top_prep(trials, self.config.num_p_hparams) # Search the best pipelines trials = self.search(x, y, p_hparams, self.m_hparams_base) self.pipeline = self.get_best_pipeline(trials) @@ -176,8 +176,19 @@ def get_best_pipeline(self, trials): print(opt) return best_pipeline - def get_top_prep(self, trials): - return hp.choice('p_params', [res['p_params'] for res in trials.results]) + @staticmethod + def get_top_prep(trials, n): + best_trials = [t for t in trials.results if t['loss'] != float('inf')] + best_trials = sorted(best_trials, key=lambda k: 
k['loss'], reverse=False) + top_p_hparams, count = [], 0 + for trial in best_trials: + if trial['p_params'] not in top_p_hparams: + top_p_hparams.append(trial) + count += 1 + if count > n: + break + + return hp.choice('p_params', top_p_hparams) @abstractmethod def get_skf(self, folds): diff --git a/autokaggle/config.py b/autokaggle/config.py index 8855251..610e84b 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -13,10 +13,11 @@ class Config: - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=False, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=5, cv_folds=3, + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, + ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, - data_info=None, balance_class_dist=False, ensembling_search_iter=10, ensembling_algo='random'): + data_info=None, balance_class_dist=False, ensembling_search_iter=10, ensembling_algo='random', + num_p_hparams=10): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() ensure_dir(self.path) @@ -44,6 +45,7 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=Fals self.balance_class_dist = balance_class_dist self.ensembling_search_iter = ensembling_search_iter self.ensembling_algo = hyperopt.rand.suggest if ensembling_algo == 'random' else hyperopt.tpe.suggest + self.num_p_hparams = num_p_hparams def update(self, options): for k, v in options.items(): diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 50dab23..604d92e 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -280,14 +280,13 @@ def get_box_plot(data, task_id, metric): if __name__ == "__main__": regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] - # classification_task_list = [3021] classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, 9914, 3020, 3524, 3573, 3962] ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time # t1 = time.time() - # for _ in range(3): + # for _ in range(1): # ak.run_automation(classification_task_list) # t2 = time.time() # print(t2-t1) From 8e0068b868a1af117ff4f29d049a902ade23fd33 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Fri, 20 Sep 2019 09:31:03 -0500 Subject: [PATCH 29/31] prep param space update --- autokaggle/config.py | 30 +++++++++++++++++++++++++++--- autokaggle/preprocessor.py | 3 +-- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/autokaggle/config.py b/autokaggle/config.py index 610e84b..89be536 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -267,9 +267,33 @@ def update(self, options): } regression_p_hspace_base = { - + 'cat_encoding': hp.choice('cat_enc', ['count', 'target+count', 'target+label', 'label']), + 'scaling': hp.choice('scaling', [True, False]), + 'log_transform': hp.choice('log_transform', [True, False]), + 'power_transform': hp.choice('power_transform', [True, False]), + 'pca': hp.choice('pca', [True, False]), + 'binning': hp.choice('binning', [True, False]), + 'add_time_offset': hp.choice('add_time_offset', [True, False]), + 'add_time_diff': hp.choice('add_time_diff', [True, False]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', 'min', None]), + # 
'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), + 'imputation_strategy': hp.choice('imputation_strategy', ['most_frequent', 'zero']), + 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), + 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) } classification_p_hspace_base = { - -} \ No newline at end of file + 'cat_encoding': hp.choice('cat_enc', ['target', 'count', 'target+count', 'target+label']), + 'scaling': hp.choice('scaling', [True, False]), + 'log_transform': hp.choice('log_transform', [True, False]), + 'power_transform': hp.choice('power_transform', [True, False]), + 'pca': hp.choice('pca', [True, False]), + 'binning': hp.choice('binning', [True, False]), + 'add_time_offset': hp.choice('add_time_offset', [True, False]), + 'add_time_diff': hp.choice('add_time_diff', [True, False]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', 'min', None]), + # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), + 'imputation_strategy': hp.choice('imputation_strategy', ['most_frequent', 'zero']), + 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), + 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) +} diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 258374a..4b789d6 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -382,8 +382,7 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - x_tr = self.binner.transform(data.X[self.selected]) - # TODO: decide if cat or num new type + x_tr = pd.DataFrame(self.binner.transform(data.X[self.selected])) data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) return data From c29f15947c7bbbb5974c27ee81a9f74794103b74 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Tue, 24 Sep 2019 19:41:29 -0500 Subject: [PATCH 30/31] Address review comments --- autokaggle/__init__.py | 2 +- autokaggle/auto_ml.py | 343 ++++++++++--- autokaggle/config.py | 91 ++-- autokaggle/ensemblers.py | 182 +++---- autokaggle/preprocessor.py | 468 ++++++++++++++++-- examples/benchmarking.py | 167 ++++++- examples/tabular_classification_binary.py | 2 +- examples/tabular_classification_multiclass.py | 2 +- examples/tabular_regression.py | 2 +- 9 files changed, 981 insertions(+), 278 deletions(-) diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py index 727b31e..6744a7a 100644 --- a/autokaggle/__init__.py +++ b/autokaggle/__init__.py @@ -1,2 +1,2 @@ -from autokaggle.auto_ml import AutoKaggleClassifier, AutoKaggleRegressor +from autokaggle.auto_ml import Classifier, Regressor from autokaggle.ensemblers import * diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 978682b..4c2dc47 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -7,48 +7,79 @@ from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error from joblib import dump, load -from autokaggle.preprocessor import TabularPreprocessor +from autokaggle.preprocessor import Preprocessor from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json from lightgbm import LGBMClassifier, LGBMRegressor -from autokaggle.config import Config, classification_hspace, regression_hspace, classification_hspace_base,\ - regression_hspace_base, regression_p_hspace_base, classification_p_hspace_base +from autokaggle.config import Config, CLASSIFICATION_PREP_HPARAM_SPACE, REGRESSION_PREP_HPARAM_SPACE, \ + 
REGRESSION_BASE_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, REGRESSION_HPARAM_SPACE from sklearn.model_selection import StratifiedKFold, KFold import hyperopt from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL from sklearn.model_selection import cross_val_score -from autokaggle.ensemblers import RankedEnsembler, StackingEnsembler +from autokaggle.ensemblers import RankedEnsemblingModel, StackedEnsemblingModel from imblearn.over_sampling import SMOTE, SMOTENC import collections -# TODO: Further clean the design of this file class AutoKaggle(BaseEstimator): - pipeline = None - m_hparams = None - m_hparams_base = None - p_hparams_base = None - - def __init__(self, config=None, **kwargs): - """ - Initialization function for tabular supervised learner. - """ + """ Automated Machine Learning system class. + + AutoKaggle implements an end to end automated ML system. It initiates and searches for the optimum ML pipeline. + The user can use it with the simple `fit()` and `predict()` methods like Sci-kit learn estimators. + The user can specify various parameters controlling different components of the system. + # Arguments + path: String. OS path for storing temporary model parameters. + verbose: Bool. Defines the verbosity of the logging. + time_limit: Int. Time budget for performing search and fit pipeline. + use_ensembling: Bool. Defines whether to use an ensemble of models + num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + ensemble_strategy: String. Strategy to ensemble models + ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked randomly. + diverse_ensemble: Bool. Whether estimators from different families are picked. + ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + search_algo: String. Search strategy for hyper-parameter search. + search_iter: Int. Number of iterations used for hyper-parameter search. + cv_folds: Int. Number of Cross Validation folds. + subsample_ratio: Percent of subsample used for for hyper-parameter search. + data_info: list(String). Lists the datatypes of each feature column. + stack_probabilities: Bool. Whether to use class probabilities in ensembling. + upsample_classes: Bool. Whether to upsample less represented classes + num_p_hparams: Int. Number of preprocessor search spaces. 
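[Editor's note, not part of the patch] A hypothetical end-to-end call of the interface documented above, using a synthetic dataset and argument names taken from the docstring; small budgets keep it quick, and it assumes Classifier accepts the same keyword arguments as the base class.

    import numpy as np
    from autokaggle import Classifier

    x = np.random.rand(100, 4)
    y = np.random.randint(0, 2, 100)
    clf = Classifier(verbose=False, use_ensembling=False, search_iter=3, cv_folds=3)
    clf.fit(x, y, time_limit=60, data_info=np.array(['NUM'] * 4))
    print(clf.evaluate(x, y))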
+ """ + + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, + num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + search_algo='random', num_p_hparams=10): self.is_trained = False - self.config = config if config else Config() - self.config.update(kwargs) - if not self.config.path: - self.config.path = rand_temp_folder_generator() + if not path: + path = rand_temp_folder_generator() + self.config = Config(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, + subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, + diverse_ensemble=diverse_ensemble, stack_probabilities=stack_probabilities, + data_info=data_info, upsample_classes=upsample_classes, + ensembling_search_iter=ensembling_search_iter, search_algo=search_algo, + num_p_hparams=num_p_hparams) + self.pipeline = None + self.m_hparams = None + self.m_hparams_base = None + self.p_hparams_base = None def fit(self, x, y, time_limit=None, data_info=None): - """ - This function should train the model parameters. - Args: + """ Train an autoML system. + # Arguments x: A numpy.ndarray instance containing the training data. y: training label vector. time_limit: remaining time budget. - data_info: meta-features of the dataset, which is an numpy.ndarray describing the - feature type of each column in raw_x. The feature type include: - 'TIME' for temporal feature, 'NUM' for other numerical feature, - and 'CAT' for categorical feature. + data_info: meta-features of the dataset, which is an numpy.ndarray describing the feature type of each + column in raw_x. The feature type include: 'TIME' for temporal feature, 'NUM' for other numerical feature, + and 'CAT' for categorical feature. + # Returns + None Both inputs X and y are numpy arrays. If fit is called multiple times on incremental data (train, test1, test2, etc.) you should warm-start your training from the pre-trained model. Past data will @@ -64,7 +95,7 @@ def fit(self, x, y, time_limit=None, data_info=None): print('#TIME features: {}'.format(sum(self.config.data_info == 'TIME'))) print('#NUM features: {}'.format(sum(self.config.data_info == 'NUM'))) print('#CAT features: {}'.format(sum(self.config.data_info == 'CAT'))) - + if x.shape[1] == 0: raise ValueError("No feature exist!") @@ -86,9 +117,12 @@ def fit(self, x, y, time_limit=None, data_info=None): self.is_trained = True def predict(self, x_test): - """ - This function should provide predictions of labels on (test) data. - The function predict eventually can return probabilities or continuous values. + """ Generate prediction on the test data for the given task. + # Arguments + x_test: A numpy.ndarray instance containing the test data. + # Returns + A numpy array for the predictions on the x_test. + This function provides predictions of labels on (test) data. """ y = self.pipeline.predict(x_test) if y is None: @@ -96,12 +130,26 @@ def predict(self, x_test): return y def predict_proba(self, x_test): + """ Predict label probabilities on the test data for the given classification task. + # Arguments + x_test: A numpy.ndarray instance containing the test data. 
+ # Returns + A numpy array for the prediction probabilities on the x_test. + The function returns predicted probabilities for every class label. + """ y = self.pipeline.predict_proba(x_test) if y is None: raise ValueError("Tabular predictor does not exist") return y def evaluate(self, x_test, y_test): + """ Predict label probabilities on the test data for the given classification task. + # Arguments + x_test: A numpy.ndarray instance containing the training data. + y_test: A numpy array with ground truth labels for the test data + # Returns + An evaluation score based on the task type. + """ if self.config.verbose: print('objective:', self.config.objective) y_pred = self.predict(x_test) @@ -114,11 +162,15 @@ def evaluate(self, x_test, y_test): results = mean_squared_error(y_test, y_pred) return results - def final_fit(self, x_train, y_train): - self.pipeline.fit(x_train, y_train) - def resample(self, x, y): - if self.config.balance_class_dist: + """ Up-samples the input data + # Arguments + x: A numpy array for features + y: A numpy array for target + # Returns + Up-sampled version of the dataset + """ + if self.config.upsample_classes: x, y = SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(x, y) while x.shape[0] < 60: x = np.concatenate([x, x], axis=0) @@ -126,6 +178,14 @@ def resample(self, x, y): return x, y def subsample(self, x, y, sample_percent): + """ Takes a sub-sample of the input data, for the hyper-parameter search. + # Arguments + x: A numpy array for features + y: A numpy array for target + sample_percent: Minimum percentage of the data to be maintained + # Returns + Down-sampled dataset + """ # TODO: Add way to balance the subsample # Set small sample for hyper-param search if x.shape[0] > 600: @@ -138,6 +198,15 @@ def subsample(self, x, y, sample_percent): return grid_train_x, grid_train_y def search(self, x, y, prep_space, model_space): + """ Do hyper-parameter search to find optimal machine learning pipeline. + # Arguments + x: A numpy array for features + y: A numpy array for target + prep_space: Hyper-parameter search space for preprocessors + model_space: Hyper-parameter search space for estimators + # Returns + List of hyper-parameter trials + """ grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) score_metric, skf = self.get_skf(self.config.cv_folds) @@ -149,7 +218,8 @@ def objective_func(params): try: eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() status = STATUS_OK - except ValueError: + except ValueError as e: + print(e) eval_score = float('-inf') status = STATUS_FAIL if self.config.verbose: @@ -161,11 +231,17 @@ def objective_func(params): trials = Trials() search_space = {'prep': prep_space, 'estimator': model_space} - _ = fmin(objective_func, search_space, algo=self.config.ensembling_algo, trials=trials, - max_evals=self.config.search_iter) + _ = fmin(objective_func, search_space, algo=self.config.search_algo, trials=trials, + max_evals=self.config.search_iter, rstate=np.random.RandomState(self.config.random_state)) return trials def get_best_pipeline(self, trials): + """ Finds the optimal pipeline from the given list of search trials. 
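+ When `use_ensembling` is enabled this delegates to `setup_ensemble()`; otherwise the single best trial from the search is used.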
+ # Arguments + trials: List of hyper-parameter search trials + # Returns + Optimal pipeline based on the given list of trials + """ if self.config.use_ensembling: best_pipeline = self.setup_ensemble(trials) else: @@ -178,6 +254,13 @@ def get_best_pipeline(self, trials): @staticmethod def get_top_prep(trials, n): + """ Find the list of top N preprocessor settings. + # Arguments + trials: List of hyper-parameter search trials + n: Maximum number of preprocessor settings required + # Returns + List of the top N optimal preprocessor settings. + """ best_trials = [t for t in trials.results if t['loss'] != float('inf')] best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) top_p_hparams, count = [], 0 @@ -192,16 +275,28 @@ def get_top_prep(trials, n): @abstractmethod def get_skf(self, folds): + """ Get the scoring metric and the cross validation folds for evaluation. + # Arguments + folds: NUmber of cross validation folds + # Returns + Scoring metric and cross validation folds. + """ pass - def pick_diverse_estimators(self, trial_list, k): + def pick_diverse_estimators(self, trial_list): + """ Selects the best hyper-parameter settings from each estimator family. + # Arguments + trial_list: List of the hyper-parameter search trials. + # Returns + List of top hyper-parameter spaces equally selected from each estimator family. + """ groups = collections.defaultdict(list) for obj in trial_list: groups[obj['model_class']].append(obj) estimator_list = [] idx, j = 0, 0 - while idx < k: + while idx < self.config.num_estimators_ensemble: for grp in groups.values(): if j < len(grp): est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'], grp[j]['p_params'], self.config) @@ -211,6 +306,12 @@ def pick_diverse_estimators(self, trial_list, k): return estimator_list def setup_ensemble(self, trials): + """ Generates the optimal ensembling estimator based on the given setting. + # Arguments + trials: List of the hyper-parameter search trials. + # Returns + An ensembling estimator to be trained using the base estimators picked from trials. + """ # Filter the unsuccessful hparam spaces i.e. 'loss' == float('inf') best_trials = [t for t in trials.results if t['loss'] != float('inf')] best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) @@ -221,7 +322,7 @@ def setup_ensemble(self, trials): np.random.shuffle(best_trials) if self.config.diverse_ensemble: - estimator_list = self.pick_diverse_estimators(best_trials, self.config.num_estimators_ensemble) + estimator_list = self.pick_diverse_estimators(best_trials) else: estimator_list = [] for i in range(self.config.num_estimators_ensemble): @@ -230,18 +331,19 @@ def setup_ensemble(self, trials): estimator_list.append(est) if self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackingEnsembler(estimator_list, config=self.config) + best_estimator_ = StackedEnsemblingModel(estimator_list, config=self.config) else: - best_estimator_ = RankedEnsembler(estimator_list, config=self.config) + best_estimator_ = RankedEnsemblingModel(estimator_list, config=self.config) return best_estimator_ @staticmethod def extract_data_info(raw_x): """ - This function extracts the data info automatically based on the type of each feature in raw_x. - - Args: + Extracts the data info automatically based on the type of each feature in raw_x. + # Arguments raw_x: a numpy.ndarray instance containing the training data. + # Returns + A list of data-types for each feature in the data. 
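+ # Example
+ An illustrative call (hypothetical output; the detected types depend on the actual column contents):
+ data_info = Classifier.extract_data_info(x_train)  # e.g. array(['NUM', 'CAT', 'TIME'])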
""" data_info = [] row_num, col_num = raw_x.shape @@ -254,16 +356,54 @@ def extract_data_info(raw_x): return np.array(data_info) -class AutoKaggleClassifier(AutoKaggle): - def __init__(self, config=None, **kwargs): - super().__init__(config, **kwargs) +class Classifier(AutoKaggle): + """ Extends AutoKaggle for Classification. + + Extends the AutoKaggle specific to the classification requirements. + # Arguments + path: String. OS path for storing temporary model parameters. + verbose: Bool. Defines the verbosity of the logging. + time_limit: Int. Time budget for performing search and fit pipeline. + use_ensembling: Bool. Defines whether to use an ensemble of models + num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + ensemble_strategy: String. Strategy to ensemble models + ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked randomly. + diverse_ensemble: Bool. Whether estimators from different families are picked. + ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + search_algo: String. Search strategy for hyper-parameter search. + search_iter: Int. Number of iterations used for hyper-parameter search. + cv_folds: Int. Number of Cross Validation folds. + subsample_ratio: Percent of subsample used for for hyper-parameter search. + data_info: list(String). Lists the datatypes of each feature column. + stack_probabilities: Bool. Whether to use class probabilities in ensembling. + upsample_classes: Bool. Whether to upsample less represented classes + num_p_hparams: Int. Number of preprocessor search spaces. + """ + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, + num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + search_algo='random', num_p_hparams=10): + super().__init__(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, + subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, diverse_ensemble=diverse_ensemble, + stack_probabilities=stack_probabilities, data_info=data_info, + upsample_classes=upsample_classes, ensembling_search_iter=ensembling_search_iter, + search_algo=search_algo, num_p_hparams=num_p_hparams) self.config.objective = 'classification' - self.m_hparams = hp.choice('classifier', [classification_hspace[m] for m in self.config.classification_models]) + self.m_hparams = hp.choice('classifier', [CLASSIFICATION_HPARAM_SPACE[m] for m in + self.config.classification_models]) self.m_hparams_base = hp.choice('classifier', - [classification_hspace_base[m] for m in self.config.classification_models]) - self.p_hparams_base = classification_p_hspace_base + [CLASSIFICATION_BASE_HPARAM_SPACE[m] for m in + self.config.classification_models]) + self.p_hparams_base = CLASSIFICATION_PREP_HPARAM_SPACE def get_skf(self, folds): + """ + See the base class. 
+ """ if self.config.objective == 'binary': score_metric = 'roc_auc' skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) @@ -273,28 +413,75 @@ def get_skf(self, folds): return score_metric, skf -class AutoKaggleRegressor(AutoKaggle): - def __init__(self, config=None, **kwargs): - super().__init__(config, **kwargs) +class Regressor(AutoKaggle): + """ Extends AutoKaggle for Regression. + + Extends the AutoKaggle specific to the regression requirements. + # Arguments + path: String. OS path for storing temporary model parameters. + verbose: Bool. Defines the verbosity of the logging. + time_limit: Int. Time budget for performing search and fit pipeline. + use_ensembling: Bool. Defines whether to use an ensemble of models + num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + ensemble_strategy: String. Strategy to ensemble models + ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked randomly. + diverse_ensemble: Bool. Whether estimators from different families are picked. + ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + search_algo: String. Search strategy for hyper-parameter search. + search_iter: Int. Number of iterations used for hyper-parameter search. + cv_folds: Int. Number of Cross Validation folds. + subsample_ratio: Percent of subsample used for hyper-parameter search. + data_info: list(String). Lists the datatypes of each feature column. + stack_probabilities: Bool. Whether to use class probabilities in ensembling. + upsample_classes: Bool. Whether to upsample less represented classes + num_p_hparams: Int. Number of preprocessor search spaces. + """ + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, + num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + search_algo='random', num_p_hparams=10): + super().__init__(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, + subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, + diverse_ensemble=diverse_ensemble, + stack_probabilities=stack_probabilities, data_info=data_info, + upsample_classes=upsample_classes, ensembling_search_iter=ensembling_search_iter, + search_algo=search_algo, num_p_hparams=num_p_hparams) self.config.objective = 'regression' - self.m_hparams = hp.choice('regressor', [regression_hspace[m] for m in self.config.regression_models]) + self.m_hparams = hp.choice('regressor', [REGRESSION_HPARAM_SPACE[m] for m in self.config.regression_models]) self.m_hparams_base = hp.choice('regressor', - [regression_hspace_base[m] for m in self.config.classification_models]) + [REGRESSION_BASE_HPARAM_SPACE[m] for m in self.config.regression_models]) - self.p_hparams_base = regression_p_hspace_base + self.p_hparams_base = REGRESSION_PREP_HPARAM_SPACE def get_skf(self, folds): + """ + See the base class.
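+ For regression this returns the 'neg_mean_squared_error' scorer together with a shuffled KFold splitter.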
+ """ return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) class AutoPipe(BaseEstimator): - prep = None - model = None - config = None - m_params = None - p_params = None - model_class = None - + """ Implements a machine learning pipeline. + + Implements a machine learning pipeline with preprocessor and estimator. A user can call fit(), and predict() + methods on it. It is used as a search unit in AutoKaggle's hyeper-parameter search. + # Arguments + config: Config. Defines the configuration of various components of the pipeline. + m_params: Dict. Hyper-parameter search space for estimator. + p_params: Dict. Hyper-parameter search space for preprocessor. + model_class: Estimator. Class name of the estimator used in the pipeline. + _estimator_type: String. Denotes if the estimator is 'classifier' or 'regressor' + prep: Preprocessor. Instance of the Preprocessor class, which does basic feature preprocessing and feature + engineering + model: Estimator. Instance of the estimator class which learns a machine learning model and predicts on the + given data. + """ def __init__(self, model_class, m_params, p_params, config): + self.prep = None + self.model = None self.config = config self.m_params = m_params self.p_params = p_params @@ -302,16 +489,37 @@ def __init__(self, model_class, m_params, p_params, config): self._estimator_type = 'classifier' if is_classifier(model_class) else 'regressor' def fit(self, x, y): - self.prep = TabularPreprocessor(self.config, self.p_params) + """ Trains the given pipeline. + # Arguments + x: A numpy.ndarray instance containing the training data. + y: training label vector. + # Returns + None + """ + self.prep = Preprocessor(self.config, self.p_params) self.model = self.model_class(**self.m_params) x = self.prep.fit_transform(x, y) self.model.fit(x, y) def predict(self, x): + """ Generate prediction on the test data for the given task. + # Arguments + x: A numpy.ndarray instance containing the test data. + # Returns + A numpy array for the predictions on the x. + This function provides predictions of labels on (test) data. + """ x = self.prep.transform(x) return self.model.predict(x) def predict_proba(self, x): + """ Predict label probabilities on the test data for the given classification task. + # Arguments + x: A numpy.ndarray instance containing the test data. + # Returns + A numpy array for the prediction probabilities on the x. + The function returns predicted probabilities for every class label. + """ x = self.prep.transform(x) try: return self.model.predict_proba(x) @@ -319,6 +527,13 @@ def predict_proba(self, x): return self.model.predict(x) def decision_function(self, x): + """ Returns the decision function learned by the estimator. + # Arguments + x: A numpy.ndarray instance containing the test data. + # Returns + Decision function learned by the estimator. + This is used by the scorers to evaluate the pipeline. + """ x = self.prep.transform(x) try: return self.model.decision_function(x) diff --git a/autokaggle/config.py b/autokaggle/config.py index 89be536..87a4f30 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -13,10 +13,35 @@ class Config: + """ Configuration for various autoML components. + + Defines the common configuration of different auto ML components. It is shared between AutoKaggle, AutoPipe, + Preprocessor and Ensembling class. + + # Arguments + path: String. OS path for storing temporary model parameters. + verbose: Bool. Defines the verbosity of the logging. 
+ time_limit: Int. Time budget for performing search and fit pipeline. + use_ensembling: Bool. Defines whether to use an ensemble of models + num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + ensemble_strategy: String. Strategy to ensemble models + ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked randomly. + diverse_ensemble: Bool. Whether estimators from different families are picked. + ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + search_algo: String. Search strategy for hyper-parameter search. + search_iter: Int. Number of iterations used for hyper-parameter search. + cv_folds: Int. Number of Cross Validation folds. + subsample_ratio: Percent of subsample used for for hyper-parameter search. + data_info: list(String). Lists the datatypes of each feature column. + stack_probabilities: Bool. Whether to use class probabilities in ensembling. + upsample_classes: Bool. Whether to upsample less represented classes + num_p_hparams: Int. Number of preprocessor search spaces. + """ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, - data_info=None, balance_class_dist=False, ensembling_search_iter=10, ensembling_algo='random', + data_info=None, upsample_classes=False, ensembling_search_iter=10, search_algo='random', num_p_hparams=10): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() @@ -42,9 +67,9 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.diverse_ensemble = diverse_ensemble self.stack_probabilities = stack_probabilities self.data_info = data_info - self.balance_class_dist = balance_class_dist + self.upsample_classes = upsample_classes self.ensembling_search_iter = ensembling_search_iter - self.ensembling_algo = hyperopt.rand.suggest if ensembling_algo == 'random' else hyperopt.tpe.suggest + self.search_algo = hyperopt.rand.suggest if search_algo == 'random' else hyperopt.tpe.suggest self.num_p_hparams = num_p_hparams def update(self, options): @@ -53,14 +78,14 @@ def update(self, options): setattr(self, k, v) -knn_classifier_params = { +KNN_CLASSIFIER_PARAMS = { 'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]), 'weights': hp.choice('weight_knn', ['uniform', 'distance']), 'metric': hp.choice('metric_knn', ["euclidean", "manhattan", "chebyshev", "minkowski"]), 'p': hp.choice('p_knn', range(1, 3)), } -svc_params = { +SVM_CLASSIFIER_PARAMS = { 'C': hp.loguniform('C_svm', np.log(0.03125), np.log(32768)), 'kernel': hp.choice('kernel_svm', ['rbf', 'poly', 'sigmoid']), 'degree': hp.choice('degree_svm', range(2, 6)), @@ -68,7 +93,7 @@ def update(self, options): 'max_iter': 50000, } -random_forest_classifier_params = { +RANDOM_FOREST_CLASSIFIER_PARAMS = { 'criterion': hp.choice('criterion_rf', ['entropy', 'gini']), 'max_features': hp.uniform('max_features_rf', 0, 1.0), 'n_estimators': hp.choice('n_estimators_rf', [100, 50]), @@ -76,7 +101,7 @@ def update(self, options): 'min_samples_split': hp.choice('min_samples_split_rf', range(2, 20)), } -lgbm_classifier_params = { +LGBM_CLASSIFIER_PARAMS = { 'boosting_type': 'gbdt', 'min_split_gain': 0.1, 'subsample': 0.8, @@ 
-88,13 +113,13 @@ def update(self, options): 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2), high=np.log(2)), } -adaboost_classifier_params = { +ADABOOST_CLASSIFIER_PARAMS = { 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)), 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), } -catboost_classifier_params = { +CATBOOST_CLASSIFIER_PARAMS = { 'iterations': hp.choice('iterations_catboost', [5, 10]), 'depth': hp.choice('depth_catboost', range(4, 11)), 'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), high=np.log(1)), @@ -104,7 +129,7 @@ def update(self, options): 'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3)) } -extra_trees_regressor_params = { +EXTRA_TREES_REGRESSOR_PARAMS = { 'n_estimators': hp.choice('n_estimators_extra_trees', [50, 100, 200]), 'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']), 'max_features': hp.uniform('max_features_extra_trees', 0, 1.0), @@ -114,13 +139,13 @@ def update(self, options): 'bootstrap': hp.choice('bootstrap_extra_trees', [True, False]), } -ridge_params = { +RIDGE_REGRESSOR_PARAMS = { 'fit_intercept': True, 'tol': hp.loguniform('tol_ridge', 1e-5, 1e-1), 'alpha': hp.loguniform('alpha_ridge', np.log(1e-5), np.log(10)) } -random_forest_regressor_params = { +RANDOM_FOREST_REGRESSOR_PARAMS = { 'criterion': hp.choice('criterion_rf', ['mse', 'friedman_mse', 'mae']), 'max_features': hp.uniform('max_features_rf', 0.1, 1.0), 'n_estimators': hp.choice('n_estimators_rf', [50, 100, 200]), @@ -129,7 +154,7 @@ def update(self, options): 'bootstrap': hp.choice('bootstrap_rf', [True, False]), } -lgbm_regressor_params = { +LGBM_REGRESSOR_PARAMS = { 'boosting_type': 'gbdt', 'min_split_gain': 0.1, 'subsample': 0.8, @@ -141,14 +166,14 @@ def update(self, options): 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), high=np.log(1)), } -adaboost_regressor_params = { +ADABOOST_REGRESSOR_PARAMS = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)), 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)), } -catboost_regressor_params = { +CATBOOST_REGRESSOR_PARAMS = { 'iterations': 2, 'depth': hp.choice('depth_catboost', range(4, 10)), 'learning_rate': 1, @@ -157,62 +182,62 @@ def update(self, options): } -regression_hspace = { +REGRESSION_HPARAM_SPACE = { 'extratree': { 'model': ExtraTreesRegressor, - 'param': extra_trees_regressor_params + 'param': EXTRA_TREES_REGRESSOR_PARAMS }, 'ridge': { 'model': Ridge, - 'param': ridge_params + 'param': RIDGE_REGRESSOR_PARAMS }, 'random_forest': { 'model': RandomForestRegressor, - 'param': random_forest_regressor_params + 'param': RANDOM_FOREST_REGRESSOR_PARAMS }, 'lgbm': { 'model': LGBMRegressor, - 'param': lgbm_regressor_params + 'param': LGBM_REGRESSOR_PARAMS }, 'adaboost': { 'model': AdaBoostRegressor, - 'param': adaboost_regressor_params + 'param': ADABOOST_REGRESSOR_PARAMS }, 'catboost': { 'model': CatBoostRegressor, - 'param': catboost_regressor_params + 'param': CATBOOST_REGRESSOR_PARAMS } } -classification_hspace = { +CLASSIFICATION_HPARAM_SPACE = { 'knn': { 'model': KNeighborsClassifier, - 'param': knn_classifier_params + 'param': KNN_CLASSIFIER_PARAMS }, 'svm': { 'model': SVC, - 'param': svc_params + 
'param': SVM_CLASSIFIER_PARAMS }, 'random_forest': { 'model': RandomForestClassifier, - 'param': random_forest_classifier_params + 'param': RANDOM_FOREST_CLASSIFIER_PARAMS }, 'lgbm': { 'model': LGBMClassifier, - 'param': lgbm_classifier_params + 'param': LGBM_CLASSIFIER_PARAMS }, 'adaboost': { 'model': AdaBoostClassifier, - 'param': adaboost_classifier_params + 'param': ADABOOST_CLASSIFIER_PARAMS }, 'catboost': { 'model': CatBoostClassifier, - 'param': catboost_classifier_params + 'param': CATBOOST_CLASSIFIER_PARAMS } } -classification_hspace_base = { +CLASSIFICATION_BASE_HPARAM_SPACE = { 'knn': { 'model': KNeighborsClassifier, 'param': {} @@ -239,7 +264,7 @@ def update(self, options): } } -regression_hspace_base = { +REGRESSION_BASE_HPARAM_SPACE = { 'extratree': { 'model': ExtraTreesRegressor, 'param': {} @@ -266,7 +291,7 @@ def update(self, options): } } -regression_p_hspace_base = { +REGRESSION_PREP_HPARAM_SPACE = { 'cat_encoding': hp.choice('cat_enc', ['count', 'target+count', 'target+label', 'label']), 'scaling': hp.choice('scaling', [True, False]), 'log_transform': hp.choice('log_transform', [True, False]), @@ -282,7 +307,7 @@ def update(self, options): 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) } -classification_p_hspace_base = { +CLASSIFICATION_PREP_HPARAM_SPACE = { 'cat_encoding': hp.choice('cat_enc', ['target', 'count', 'target+count', 'target+label']), 'scaling': hp.choice('scaling', [True, False]), 'log_transform': hp.choice('log_transform', [True, False]), diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index 30e411f..fa394d2 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -17,62 +17,99 @@ from sklearn.model_selection import RandomizedSearchCV, cross_val_score import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK -from autokaggle.config import classification_hspace, regression_hspace +from autokaggle.config import REGRESSION_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE,\ + REGRESSION_BASE_HPARAM_SPACE -lgbm_classifier_params = { - 'n_estimators': hp.choice('n_estimators', [100, 150, 200]), -} +class EnsemblingModel: + """ Base class for ensembling estimators. -_classification_hspace = { - 'lgbm': { - 'model': LGBMClassifier, - 'param': lgbm_classifier_params - }, -} + This class creates an ensembling estimator from a given list of estimators. The user can call fit() and + predict() methods, similar to the scikit-learn estimators. - -class RankedEnsembler: + # Arguments + config: Config. Defines the configuration of various components of the autoML pipeline. + estimator_list: List. List of the estimators, to be used for building an ensemble. + """ def __init__(self, estimator_list, config): self.config = config - self.ensemble_method = config.ensemble_method - self.estimators = estimator_list - + self.estimator_list = estimator_list + + @abstractmethod + def fit(self, X, y): + """ Trains the ensemble of estimators on the training data. + # Arguments + X: A numpy array instance containing the training data. + # Returns + None + """ + pass + + @abstractmethod + def predict(self, X): + """ Generate prediction on the test data for the given task. + # Arguments + X: A numpy array instance containing the test data. + # Returns + A numpy array for the predictions on the x_test. + This function provides predicts on the input data using the ensemble of estimators. 
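+ # Example
+ An illustrative sketch using the ranking based subclass defined below (assumes `estimator_list` is a list of AutoPipe instances and `config` is a Config object):
+ ensemble = RankedEnsemblingModel(estimator_list, config=config)
+ ensemble.fit(x_train, y_train)
+ y_pred = ensemble.predict(x_test)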
+ """ + pass + + +class RankedEnsemblingModel(EnsemblingModel): + """ Implements ensembling using ranking based methods. + + This class implements randing based ensembling using ensembling methods amongst: ('mean', 'median', 'max' and + 'majority_voting') + """ def fit(self, X, y): - for est in self.estimators: + for est in self.estimator_list: est.fit(X, y) def predict(self, X): - predictions = np.zeros((len(X), len(self.estimators))) - for i, est in enumerate(self.estimators): + predictions = np.zeros((len(X), len(self.estimator_list))) + for i, est in enumerate(self.estimator_list): predictions[:, i] = est.predict(X) - if self.ensemble_method == 'median': + if self.config.ensemble_method == 'median': return np.median(predictions, axis=1) - elif self.ensemble_method == 'mean': + elif self.config.ensemble_method == 'mean': return np.mean(predictions, axis=1) - elif self.ensemble_method == 'max': + elif self.config.ensemble_method == 'max': return np.max(predictions, axis=1) - elif self.ensemble_method == 'min': + elif self.config.ensemble_method == 'min': return np.min(predictions, axis=1) - elif self.ensemble_method == 'max_voting': + elif self.config.ensemble_method == 'max_voting': return stats.mode(predictions, axis=1)[0] -class StackingEnsembler: - stacking_estimator = None +class StackedEnsemblingModel(EnsemblingModel): + """ Implements a stacking based ensembling estimator. + + This class creates an ensembling estimator using stacking. It trains an Light-GBM model on the predictions of + thh base estimator. + # Arguments + stacking_estimator: LightGBM estimator. Meta-learning algorithm for the stacking estimator. + """ def __init__(self, estimator_list, config): - self.config = config - self.estimator_list = estimator_list - self.objective = config.objective + super().__init__(estimator_list, config) + self.stacking_estimator = None + if self.config.objective == 'regression': - self.hparams = hp.choice('regressor', [regression_hspace[m] for m in ['lgbm']]) + self.hparams = hp.choice('regressor', [REGRESSION_BASE_HPARAM_SPACE['lgbm']]) self.config.stack_probabilities = False else: - self.hparams = hp.choice('classifier', [_classification_hspace[m] for m in ['lgbm']]) + self.hparams = hp.choice('classifier', [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']]) def get_model_predictions(self, X): + """ Generate the combined predictions from the list of the estimators. + # Arguments + X: A numpy array instance containing the training/test data. + # Returns + A numpy array for the predictions of all the estimators in the list. + """ if self.config.stack_probabilities: predictions = np.zeros((len(X), 1)) for i, est in enumerate(self.estimator_list): @@ -98,6 +135,12 @@ def fit(self, X, y): self.stacking_estimator.fit(predictions, y_val) def search(self, x, y): + """ Search function to find best hyper-param setting for the stacking model. + # Arguments + x: A numpy array instance containing the training data + # Returns + List of trials on various hyper-parameter settings. 
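+ Internally this runs hyperopt's `fmin` over the LightGBM meta-learner space for `ensembling_search_iter` evaluations and keeps the best setting found.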
+ """ score_metric, skf = self.get_skf(self.config.cv_folds) def objective_func(args): @@ -112,8 +155,8 @@ def objective_func(args): return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=self.config.ensembling_algo, trials=trials, - max_evals=self.config.ensembling_search_iter) + best = fmin(objective_func, self.hparams, algo=self.config.search_algo, trials=trials, + max_evals=self.config.ensembling_search_iter, rstate=np.random.RandomState(self.config.random_state)) opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) @@ -127,6 +170,12 @@ def predict(self, X): return self.stacking_estimator.predict(predictions) def get_skf(self, folds): + """ Get scoring metric and cross validation folds for the task type + # Arguments + folds: Number of cross validation folds + # Returns + Scoring metric and CV folds + """ if self.config.objective == 'binary': score_metric = 'roc_auc' skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) @@ -139,72 +188,3 @@ def get_skf(self, folds): else: ValueError("Invalid objective") return score_metric, skf - - -class EnsembleSelection: - indices_ = None - weights_ = None - - def __init__(self, estimator_list, config): - self.estimator_list = estimator_list - self.config = config - self.objective = config.objective - self.indices_, self.weights_ = [], [] - self.ensemble_size = len(estimator_list) - if self.objective == 'regression': - self.score_metric = 'neg_mean_squared_error' - self.skf = KFold(n_splits=3, shuffle=True, random_state=1001) - else: - self.score_metric = 'neg_mean_squared_error' - self.skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1001) - - def fit(self, X, y_true): - """Rich Caruana's ensemble selection method. (fast version)""" - ensemble = [] - trajectory = [] - order = [] - - for i in range(self.ensemble_size): - scores = np.zeros((len(self.estimator_list))) - s = len(ensemble) - if s == 0: - weighted_ensemble_prediction = np.zeros(len(self.estimator_list)) - else: - # Memory-efficient averaging! - ensemble_prediction = np.zeros(ensemble[0].shape) - for est in ensemble: - ensemble_prediction += est - ensemble_prediction /= s - - weighted_ensemble_prediction = (s / float(s + 1)) * ensemble_prediction - fant_ensemble_prediction = np.zeros(weighted_ensemble_prediction.shape) - for j, est in enumerate(self.estimator_list): - fant_ensemble_prediction[:,:] = weighted_ensemble_prediction + \ - (1. 
/ float(s + 1)) * est - scores[j] = cross_val_score(self.estimator_list[j], X, y_true, scoring=self.score_metric, - cv=self.skf).mean() - - all_best = np.argwhere(scores == np.nanmin(scores)).flatten() - best = np.random.RandomState.choice(all_best) - ensemble.append(self.estimator_list[best]) - trajectory.append(scores[best]) - order.append(best) - - # Handle special case - if len(self.estimator_list) == 1: - break - - self.indices_ = order - ensemble_members = collections.Counter(self.indices_) - weights = np.zeros((self.ensemble_size,), dtype=float) - for ensemble_member in ensemble_members: - weight = float(ensemble_member[1]) / self.ensemble_size - weights[ensemble_member[0]] = weight - - if np.sum(weights) < 1: - weights = weights / np.sum(weights) - - self.weights_ = weights - - def predict(self, X): - return np.average(X, axis=1, weights=self.weights_) diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index 4b789d6..d286dd9 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -15,31 +15,30 @@ LEVEL_HIGH = 32 -class TabularPreprocessor(TransformerMixin): - pipeline = None - data_info = None - params = None - config = None - +class Preprocessor(TransformerMixin): + """ Implements basic preprocessing and feature engineering class. + + Preprocessor takes care of the basic preprocessing and feature engineering of the input data. Similar to + Scikit-learn transformers, it implements the fit() and transform() methods. TO acheive this It applies various + feature primitives in a sequence using scikit-learn pipeline. + # Arguments + config: Config. Defines the configuration of various components of the autoML pipeline. + params: Dict. Hyper-parameter search space for preprocessor. + pipeline: Pipeline. Sci-kit learn pipeline class to apply the feature primitives in sequence + """ def __init__(self, config, params): - """ - Initialization function for tabular preprocessor. - """ self.config = config self.params = params + self.pipeline = None def fit(self, raw_x, y): - """ - This function should train the model parameters. - - Args: - raw_x: a numpy.ndarray instance containing the training data. - y: training label vector. - time_limit: remaining time budget. - data_info: meta-features of the dataset, which is an numpy.ndarray describing the - feature type of each column in raw_x. The feature type include: - 'TIME' for temporal feature, 'NUM' for other numerical feature, - and 'CAT' for categorical feature. + """ This function trains the preprocessor chain + # Arguments + raw_x: A numpy array instance containing the training data data. + y: A numpy array instance containing training label vector. + # Returns + None + This function fits the preprocessor chain on the given training data """ data = TabularData(raw_x, self.config.data_info, self.config.verbose) @@ -57,15 +56,13 @@ def fit(self, raw_x, y): return self def transform(self, raw_x): - """ - This function should train the model parameters. - - Args: - raw_x: a numpy.ndarray instance containing the training/testing data. - Both inputs X and y are numpy arrays. - If fit is called multiple times on incremental data (train, test1, test2, etc.) - you should warm-start your training from the pre-trained model. Past data will - NOT be available for re-training. + """ Generate data transformation on the given data. + # Arguments + raw_x: a numpy array instance containing the training/testing data + # Returns + A numpy array instance containing the transformed data. 
+ This function provides transforms the input data by applying the transformations using the pre-trained + preprocessor chain. """ # Get Meta-Feature data = TabularData(raw_x, self.config.data_info, self.config.verbose) @@ -74,6 +71,12 @@ def transform(self, raw_x): @staticmethod def get_categorical_pipeline(params): + """ Generate pipeline of primitives for categorical features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ choice = params.get('cat_encoding', 'target') cat_pipeline = [] if choice == 'target': @@ -96,6 +99,12 @@ def get_categorical_pipeline(params): @staticmethod def get_numerical_pipeline(params): + """ Generate pipeline of primitives for numerical features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ scaling = params.get('scaling', True) log_transform = params.get('log_transform', False) power_transform = params.get('power_transform', False) @@ -116,6 +125,12 @@ def get_numerical_pipeline(params): return numeric_pipeline def get_filtering_pipeline(self, params): + """ Generate pipeline of primitives to filter less useful features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ pearson_thresh = params.get('pearson_thresh', 0) feat_importance_thresh = params.get('feat_importance_thresh', 0) @@ -132,6 +147,12 @@ def get_filtering_pipeline(self, params): @staticmethod def get_time_pipeline(params): + """ Generate pipeline of primitives for time features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ add_offset = params.get('add_time_offset', False) add_diff = params.get('add_time_diff', False) time_pipeline = [] @@ -143,12 +164,24 @@ def get_time_pipeline(params): @staticmethod def get_imputation_pipeline(params): + """ Generate pipeline of primitives to impute the missing values. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ strategy = params.get('imputation_strategy', 'most_frequent') impute_pipeline = [('imputer', Imputation(operation='upd', selected_type='ALL', strategy=strategy))] return impute_pipeline @staticmethod def get_higher_order_pipeline(params): + """ Generate pipeline of primitives to generate cross-column features. + # Arguments + params: Hyper-parameter setting for the preprocessors. + # Returns + List of primitives to be applied (based on the given setting) + """ cat_num_strategy = params.get('cat_num_strategy', None) cat_cat_strategy = params.get('cat_cat_strategy', None) pipeline = [] @@ -162,16 +195,35 @@ def get_higher_order_pipeline(params): class TabularData: - cat_col = None - num_col = None - time_col = None - n_cat, n_time, n_num = 0, 0, 0 - cat_cardinality = None - generated_features = None - feature_options = None - num_info = None - + """ Represents the data and its meta-info. + + TabularData includes the training/testing data along with its meta info such as data types, cardinality etc. The + user can update the data and its meta info as well as select the features matching the criteria. + # Arguments + verbose: Bool. Determines the verbosity of the logging. + data_info: Dict. 
Dictionary mapping the feature names to their data_types. + total_samples: Int. Number of samples in the data + cat_col: List. List of the categorical features + num_col: List. List of the numerical features + time_col: List. List of the time features + n_cat: Int. Number of categorical features + n_num: Int. Number of numerical features + n_time: Int. Number of time features + cat_cardinality: Dict. Dictionary mapping categorical feature names of their cardinality (no. of unique + values) + generated_features: List. List of the newly added features. (In addition to the pre-existing columns) + num_info: Dict. Dictionary mapping numeircal column to their meta info such as range, std etc. + """ def __init__(self, raw_x, data_info, verbose=True): + self.cat_col = None + self.num_col = None + self.time_col = None + self.n_cat = 0 + self.n_time = 0 + self.n_num = 0 + self.cat_cardinality = None + self.generated_features = None + self.num_info = None self.verbose = verbose self.data_info = {str(i): data_info[i] for i in range(len(data_info))} self.total_samples = raw_x.shape[0] @@ -188,14 +240,35 @@ def __init__(self, raw_x, data_info, verbose=True): # self.update_cat_cardinality() def update_type(self, columns, new_type): + """ Updates the column datatype. + # Arguments + column: List of columns whose data_type needs update. + new_type: New data_type (either of 'CAT', 'NUM' or 'TIME'). + # Returns + None. + This function updates the data types of given list of columns. + """ for c in columns: self.data_info[c] = new_type def delete_type(self, columns): + """ Delete the columns from the feature to data_type mapping. + # Arguments + column: List of columns whose data_type needs update. + # Returns + None + This function removes the selected columns from the data_info dictionary. + """ for c in columns: _ = self.data_info.pop(c, 0) def rename_cols(self, key): + """ Provides a rename function to add new columns without collision. + # Arguments + key: Identifier for renaming + # Returns + Renaming function which takes current column name and outputs a new unique column name. + """ def rename_fn(col_name): col_name = str(col_name) col_name += '_' + key @@ -205,6 +278,18 @@ def rename_fn(col_name): return rename_fn def update(self, operation, columns, x_tr, new_type=None, key=''): + """ Updates the TabularData after applying primitive. + # Arguments + operation: Primitive operation applied ('add', 'update' or 'delete'). + columns: List of columns affected. + x_tr: Transformed (or newly generated) features + new_type: Data type of the new column + key: Name key for renaming the new columns + # Returns + None + This function takes the transformed (or generated) features after applying the primitive and updates the + TabularData. + """ if operation == 'upd': if x_tr is not None: self.X[columns] = x_tr @@ -224,6 +309,13 @@ def update(self, operation, columns, x_tr, new_type=None, key=''): self.refresh_col_types() def refresh_col_types(self): + """ Updates the column_types based on the data_info + # Arguments + None + # Returns + None + This function updates the cat, num and time column lists based on (any) updates in the data_info. 
+ """ self.cat_col = [k for k, v in self.data_info.items() if v == 'CAT'] self.num_col = [k for k, v in self.data_info.items() if v == 'NUM'] self.time_col = [k for k, v in self.data_info.items() if v == 'TIME'] @@ -232,6 +324,12 @@ def refresh_col_types(self): self.n_cat = len(self.cat_col) def update_cat_cardinality(self): + """ Update categorical cardinality mapping for all categorical columns. + # Arguments + None + # Returns + None + """ # TODO: too slow make it faster if not self.cat_cardinality: self.cat_cardinality = {} @@ -239,6 +337,12 @@ def update_cat_cardinality(self): self.cat_cardinality[c] = len(set(self.X[c])) def select_columns(self, data_type): + """ Returns all the columns matching the input data_type + # Arguments + data_type: Required type of the data (either of 'CAT', 'NUM', 'TIME' or 'ALL') + # Returns + List of the feature columns matching the input criteria. + """ self.refresh_col_types() if data_type == 'CAT': return self.cat_col @@ -254,22 +358,47 @@ def select_columns(self, data_type): class Primitive(BaseEstimator, TransformerMixin): - selected = None - drop_columns = None - options = None - supported_ops = ('add', 'upd', 'del') - name_key = '' - + """ Base class for the single order data transformation function. + + Primitive learns and applies the data transformation on a given set of features. The user can use fit() and + transform() functions to apply these transformations. + + # Arguments + options: Dict. Special arguments specific to the given primitive. + selected_type: 'String'. Specifies the type of features the transformation is supposed to be applied to. + operation: 'String'. Specifies the type of operation from 'add', 'update' or 'delete' + name_key : 'String'. Signature key to rename the column after applying the primitive. + selected: 'List'. List of the selected features, on which the transformation will be applied + drop_columns: 'List'. List of the features which would be dropped after applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this primitive. + """ def __init__(self, operation='upd', selected_type=None, **kwargs): + self.options = None + self.selected = None + self.drop_columns = None + self.supported_ops = ('add', 'upd', 'del') self.selected_type = selected_type self.operation = operation self.init_vars(**kwargs) self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): + """ Initialize the primitive specific variables (which are not defined in the base class) + # Arguments + kwargs: Dictionary containing primitive specific variables + # Returns + None. + """ self.options = kwargs def fit(self, data, y=None): + """ A wrapper function to train the given primitive on the input training data. + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ self.selected = data.select_columns(self.selected_type) if self.operation not in self.supported_ops: print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) @@ -279,28 +408,68 @@ def fit(self, data, y=None): return self._fit(data, y) def transform(self, data, y=None): + """ A wrapper function to generate transformation on the input data based on pre-trained primitive. + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. 
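+ # Example
+ An illustrative sketch with one of the concrete primitives defined below (assumes `data` is a TabularData instance and `y` a numpy target array):
+ imputer = Imputation(operation='upd', selected_type='ALL', strategy='most_frequent')
+ imputer.fit(data, y)
+ data = imputer.transform(data)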
+ """ if not self.selected: return data return self._transform(data, y) @abstractmethod def _fit(self, data, y=None): + """ Contains the actual implementation of training the primitive (implemented in the child class) + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ pass @abstractmethod def _transform(self, data, y=None): + """ Contains the actual implementation of transforming the data using primitive. (implemented in the child + class) + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. + """ pass class PrimitiveHigherOrder: - selected_1 = None - selected_2 = None - drop_columns = None - options = None - supported_ops = ('add', 'upd', 'del') - name_key = '' - + """ Base class for the cross-order data transformation function. + + PrimitiveHigherOrder learns and applies the data transformation across two sets of features. The user can use + fit() and transform() functions to apply these transformations. + + # Arguments + options: Dict. Special arguments specific to the given primitive. + selected_type1: 'String'. Specifies the first type of features the transformation is supposed to be applied + to. + selected_type2: 'String'. Specifies the second type of features the transformation is supposed to be applied + to. + operation: 'String'. Specifies the type of operation from 'add', 'update' or 'delete' + name_key : 'String'. Signature key to rename the column after applying the primitive. + selected_1: 'List'. List of the selected features in the first set, on which the transformation will be + applied + selected_2: 'List'. List of the selected features in the second set, on which the transformation will be + applied + drop_columns: 'List'. List of the features which would be dropped after applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this primitive. + """ def __init__(self, operation='upd', selected_type1=None, selected_type2=None, **kwargs): + self.options = None + self.selected_1 = None + self.selected_2 = None + self.drop_columns = None + self.supported_ops = ('add', 'upd', 'del') self.operation = operation self.selected_type1 = selected_type1 self.selected_type2 = selected_type2 @@ -308,9 +477,22 @@ def __init__(self, operation='upd', selected_type1=None, selected_type2=None, ** self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): + """ Initialize the primitive specific variables (which are not defined in the base class) + # Arguments + kwargs: Dictionary containing primitive specific variables + # Returns + None. + """ self.options = kwargs def fit(self, data, y=None): + """ A wrapper function to train the given primitive on the input training data. + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ self.selected_1 = data.select_columns(self.selected_type1) self.selected_2 = data.select_columns(self.selected_type2) @@ -323,20 +505,49 @@ def fit(self, data, y=None): return self._fit(data, y) def transform(self, data, y=None): + """ A wrapper function to generate transformation on the input data based on pre-trained primitive. + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. 
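+ # Example
+ An illustrative sketch (hypothetical argument values; the real pipeline builds these primitives in `Preprocessor.get_higher_order_pipeline`):
+ encoder = CatNumEncoder(operation='add', selected_type1='CAT', selected_type2='NUM', strategy='mean')
+ encoder.fit(data, y)
+ data = encoder.transform(data)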
+ """ if not self.selected_1 or not self.selected_2: return data return self._transform(data, y) @abstractmethod def _fit(self, data, y=None): + """ Contains the actual implementation of training the primitive (implemented in the child class) + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ pass @abstractmethod def _transform(self, data, y=None): + """ Contains the actual implementation of transforming the data using primitive. (implemented in the child + class) + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. + """ pass class TabScaler(Primitive): + """ Standard Scaler primitive. + + TabScaler scales the selected numerical features to have 0 mean and unit variance. + + # Arguments + scaler: StandardScaler. Instance of scikit-learn StandardScaler object + """ scaler = None supported_ops = ('add', 'upd') @@ -352,6 +563,13 @@ def _transform(self, data, y=None): class BoxCox(Primitive): + """ Power Transform primitive. + + The class applies BoxCox power transformation to make the selected features have normal distribution. + + # Arguments + transformer: PowerTransformer. Instance of scikit-learn PowerTransformer object + """ transformer = None supported_ops = ('add', 'upd') @@ -367,6 +585,17 @@ def _transform(self, data, y=None): class Binning(Primitive): + """ Numerical binning primitive. + + The class applies divides the given numeric column in the list of buckets, based on the range of their values. + + # Arguments + binner: KBinsDiscretizer. Instance of scikit-learn KBinsDiscretizer object + strategy: String. Strategy used to define width of the bins. Possible options are: (‘uniform’, ‘quantile’, + ‘kmeans’) + encoding: String. Method used to encode the transformed result. Possible options are: (‘onehot’, + ‘onehot-dense’, ‘ordinal’) + """ binner = None strategy = None encoding = None @@ -388,6 +617,13 @@ def _transform(self, data, y=None): class OneHot(Primitive): + """ One Hot Encoder for categorical features. + + The class applies one hot encoding to categorical features, using the sklearn implementation. + + # Arguments + ohe: OneHotEncoder. Instance of scikit-learn OneHotEncoder object + """ ohe = None supported_ops = ('add', 'upd') @@ -407,6 +643,15 @@ def _transform(self, data, y=None): class LabelEncode(Primitive): + """ Label Encoder for categorical features. + + The class applies Label Encoding to categorical features, By mapping each category to a numerical value. + + # Arguments + cat_to_int_label: Dict. Mapping from categories to their assigned integer value + unknown_key_dict: Dict. Mapping for each categorical feature column to the integer value to replace the + previously unseen categories + """ cat_to_int_label = None unknown_key_dict = None supported_ops = ('add', 'upd') @@ -428,11 +673,28 @@ def _transform(self, data, y=None): class TargetEncoder(Primitive): + """ Target Encoder for categorical features. + + The class applies target encoding to categorical features, By learning the mapping of category to numeric value + based on some aggregation of the target value. + + # Arguments + target_encoding_map: Dict. Mapping from categories to their assigned numeric value + """ target_encoding_map = None supported_ops = ('add', 'upd') @staticmethod def calc_smooth_mean(df, by, on, alpha=5): + """ Calculates the smoothed means on the target value. 
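+ A common smoothing scheme (assumed here for illustration) maps each category to (n * category_mean + alpha * global_mean) / (n + alpha), where n is the number of rows in the category, so rare categories are pulled towards the global target mean.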
+ # Arguments + df: Input dataframe + by: Groupby column (categorical column) + on: Target column + alpha: smoothing factor + # Returns + smoothed mean and the overall mean + """ # Compute the global mean mean = df[on].mean() @@ -463,6 +725,14 @@ def _transform(self, data, y=None): class CatCatEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and categorical columns. + + The class learns a new features based on the values of selected two categorical features. + + # Arguments + cat_cat_map: Dict. Mapping from cat-cat combination to the an assigned numeric value + strategy: String. Aggregation strategy to learn the mapping between cat-cat combination to numeric value + """ supported_ops = ('add', ) cat_cat_map = None strategy = None @@ -472,6 +742,14 @@ def init_vars(self, strategy='count'): @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): + """ Generate mapping for cat-cat combination to the numerical value based on the given strategy. + # Arguments + col1: First categorical column + col2: Second categorical column + strategy: Aggregation strategy + # Returns + Mapping from cat-cat combination to the numeric value.. + """ if strategy == 'count': mapping = df.groupby([col1])[col2].count() elif strategy == 'nunique': @@ -497,6 +775,14 @@ def _transform(self, data, y=None): class CatNumEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and numerical columns. + + The class learns a new features based on the values of selected categorical and numerical features. + + # Arguments + cat_num_map: Dict. Mapping from cat-num combination to the an assigned numeric value + strategy: String. Aggregation strategy to learn the mapping between cat-num combination to numeric value + """ supported_ops = ('add', ) cat_num_map = None strategy = None @@ -506,6 +792,14 @@ def init_vars(self, strategy='mean'): @staticmethod def cat_num_interaction(df, col1, col2, method='mean'): + """ Generate mapping for cat-num combination to the numerical value based on the given strategy. + # Arguments + col1: categorical column + col2: numerical column + method: Aggregation strategy + # Returns + Mapping from cat-num combination to the numeric value.. + """ if method == 'mean': mapping = df.groupby([col1])[col2].mean() elif method == 'std': @@ -537,6 +831,14 @@ def _transform(self, data, y=None): class CatBinEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and binary columns. + + The class learns a new features based on the values of selected categorical and binary features. + + # Arguments + cat_bin_map: Dict. Mapping from cat-bin combination to the an assigned numeric value + strategy: String. Aggregation strategy to learn the mapping between cat-bin combination to numeric value + """ supported_ops = ('add', ) cat_bin_map = None strategy = None @@ -546,6 +848,14 @@ def init_vars(self, strategy='percent_true'): @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): + """ Generate mapping for cat-bin combination to the numerical value based on the given strategy. + # Arguments + col1: Categorical column + col2: Binary column + strategy: Aggregation strategy + # Returns + Mapping from cat-bin combination to the numeric value.. 
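The cross-column mappings described above boil down to a groupby-aggregate followed by a map. A toy illustration (the column and feature names are made up for the example):

import pandas as pd

df = pd.DataFrame({'shop': ['a', 'a', 'b', 'b'],
                   'price': [10.0, 30.0, 5.0, 15.0],
                   'returned': [0, 1, 0, 0]})

# CatNumEncoder with strategy='mean': average numeric value per category.
cat_num_map = df.groupby('shop')['price'].mean()
# CatBinEncoder with strategy='percent_true': mean of a 0/1 column per category.
cat_bin_map = df.groupby('shop')['returned'].mean()

df['shop_price_mean'] = df['shop'].map(cat_num_map)
df['shop_return_rate'] = df['shop'].map(cat_bin_map)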
+ """ if strategy == 'percent_true': mapping = df.groupby([col1])[col2].mean() elif strategy == 'count': @@ -572,6 +882,11 @@ def _transform(self, data, y=None): class FilterConstant(Primitive): + """ Filters the constant or very low variance columns. + + The class finds the non-changing or very low variance columns and marks them for deletion, so that they are + not used by the machine learning estimator. + """ drop_columns = None supported_ops = ('del',) @@ -586,6 +901,10 @@ def _transform(self, data, y=None): class TimeDiff(Primitive): + """ Adds features based on difference of time values. + + This class generates new features as the time difference between two selected time columns. + """ supported_ops = ('add', ) def _fit(self, data, y=None): @@ -600,6 +919,13 @@ def _transform(self, data, y=None): class TimeOffset(Primitive): + """ Updates the time features in terms of difference from the start value. + + This class updates the time features such that they are represented as a difference from the start time. + + # Arguments + start_time: Int. Starting time of the selected time feature. + """ start_time = None supported_ops = ('add', 'upd') @@ -615,6 +941,12 @@ def _transform(self, data, y=None): class TabPCA(Primitive): + """ Generates new features by finding PCA of the selected features. + + The class calculates the PCA of the selected features and adds the transformation as a new set of features. + # Arguments + pca: PCA. Scikit-learn PCA class. + """ pca = None supported_ops = ('add', ) @@ -631,6 +963,13 @@ def _transform(self, data, y=None): class CatCount(Primitive): + """ Count Encoding. + + Replaces the categorical variables by their occurrence count. + # Arguments + count_dict: Dict. Mapping of the categories to their respective frequency count. + unknown_key: Float. Mapping value for previously unseen category. + """ count_dict = None unknown_key = 0 supported_ops = ('add', 'upd') @@ -650,6 +989,11 @@ def _transform(self, data, y=None): class LogTransform(Primitive): + """ Calculates the log transformation. + + The class calculates the log transform value of the given numeric feature. The formula is: + sign(x) * log(1 + mod(x)) + """ name_key = 'log_' supported_ops = ('add', 'upd') @@ -665,6 +1009,12 @@ def _transform(self, data, y=None): class Imputation(Primitive): + """ Imputes the missing values in the features. + + The class fills in the missing values of each feature column based on the chosen imputation strategy (such as the most frequent value or zero). + # Arguments + impute_dict: Dict. Mapping of each feature column to its imputation value. + """ impute_dict = None supported_ops = ('add', 'upd') strategy = None @@ -693,6 +1043,12 @@ def _transform(self, data, y=None): class FeatureFilter(Primitive): + """ Filters the features based on Pearson Correlation. + + The class removes the features which have low Pearson correlation with the target. + # Arguments + threshold: Float. Threshold for filtering features. + """ threshold = None supported_ops = ('del',) @@ -717,6 +1073,14 @@ def _transform(self, data, y=None): class FeatureImportance(Primitive): + """ Filters the features based on feature importance score. + + The class learns a LightGBM estimator on the given data and, based on the feature importance scores, filters + out the features with importance lower than the threshold. + # Arguments + threshold: Float. Threshold for filtering features. + task_type: 'String'.
Specifies the task type amongst: ('classification', 'regression') + """ threshold = None task_type = 'classification' supported_ops = ('del',) @@ -776,7 +1140,7 @@ def _transform(self, data, y=None): datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) print(x_train[:4, 20]) - prep = TabularPreprocessor() + prep = Preprocessor() prep.fit(x_train, y_train, 24*60*60, datainfo) x_new = prep.transform(x_train) diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 604d92e..0b5b22a 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -16,6 +16,7 @@ openml.config.apikey = '3c7196c92a274c3b9405a7e26e9f848e' import warnings from abc import abstractmethod +import statistics def generate_rand_string(size): @@ -24,6 +25,20 @@ def generate_rand_string(size): class BenchmarkingBase: + """ Base class for benchmarking autoML platforms. + + This class benchmarks the performance of the given autoML platform. The user can call evaluate() method to + evaluate the performance on a single task or run_automation() for the list of the tasks. The tasks are OpenML + tasks, which specify the dataset and the train/test/validation folds etc. + + # Arguments + results: List. List of the results for each evaluation + sess_name: String. Name of the evaluation session, used for storing the results. + cls_desc: List. List of the columns to be added in classification result + rgs_desc: List. List of the columns to be added in regression result + cls_results: DataFrame. Table storing the classification results + rgs_results: DataFrame. Table storing the regression results + """ results = None cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", "balanced_accuracy", "F1_score", "AUC"] rgs_desc = ["automl_model", "task_id", "time_limit", "MSE", "MAE", "R2_score"] @@ -37,6 +52,14 @@ def __init__(self, supress_warnings=True, sess_name=""): self.rgs_results = pd.DataFrame(columns=self.rgs_desc) def measure_performance_cls(self, y_true, y_pred, binary=False): + """ Calculate the performance of the classification task + # Arguments + y_true: A numpy array containing the ground truth labels + y_pred: A numpy array containing the predicted labels + binary: Boolean specifying if the objective isbinary or multiclass + # Returns + list of the performance scores based on various evaluation metrics. + """ accuracy = accuracy_score(y_true, y_pred) ber = balanced_accuracy_score(y_true, y_pred) f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score(y_true, y_pred, average="weighted") @@ -44,12 +67,25 @@ def measure_performance_cls(self, y_true, y_pred, binary=False): return [accuracy, ber, f1, auc] def measure_performance_rgs(self, y_true, y_pred): + """ Calculate the performance of the regression task + # Arguments + y_true: A numpy array containing the ground truth + y_pred: A numpy array containing the predicted values + # Returns + list of the performance scores based on various evaluation metrics. + """ mse = mean_squared_error(y_true, y_pred) mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred) return [mse, mae, r2] def export_results(self): + """ Writes the results to a CSV file. + # Arguments + None + # Returns + None + """ if len(self.cls_results) > 0: self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) if len(self.rgs_results) > 0: @@ -57,9 +93,23 @@ def export_results(self): @abstractmethod def evaluate(self, task, time_limit): + """ Evaluates the performance of the single task. 
+ # Arguments + task: Id of the OpenML task flow + time_limit: Budget for the given task + # Returns + List of performance scores of the autoML system on the given task. + """ pass def run_automation(self, task_list, time_limit=10*60): + """ Evaluates the list of tasks in sequence + # Arguments + task_list: List of OpenML task ids + time_limit: Budget for each of the tasks + # Returns + None + """ for task in task_list: try: self.evaluate(task, time_limit=time_limit) @@ -68,12 +118,27 @@ def run_automation(self, task_list, time_limit=10*60): print("task: {} didn't work".format(task)) def time_lapse(self, task_id, time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): + """ Evaluates the task on different time_limits + # Arguments + task_id: Id of the OpenML task flow + time_limits: List of the time_limits to test the performance on + # Returns + List of combined results of the autoML on each of the time_limits + This function evaluates and compares the performance of the autoML system on different time_limits. It is + helpful to understand the amount of improvement with an increase in time budget + """ tl_results = [] for time_limit in time_limits: tl_results.append(self.evaluate(task_id, time_limit=time_limit)) return tl_results def get_dataset_splits(self, task_id): + """ Get the train/test splits for the given task + # Arguments + task_id: Id of OpenML task flow + # Returns + Train/Test datasets in numpy array format + """ task = openml.tasks.get_task(task_id) train_indices, test_indices = task.get_train_test_split_indices() dataset = task.get_dataset() @@ -85,12 +150,18 @@ class BenchmarkingAutoKaggle(BenchmarkingBase): - estimator_type = None - - def set_estimator_type(self, est_type): - self.estimator_type = est_type - + """ Extends the benchmarking class for evaluating AutoKaggle. + + This class evaluates the performance of AutoKaggle on the input classification or regression task_list. + """ def get_data_info(self, dataset, num_cols): + """ Get the info of each feature data type + # Arguments + dataset: dataset id in OpenML + num_cols: Total number of columns + # Returns + A numpy array containing the data_type of each feature column + """ nominal_feat = dataset.get_features_by_type('nominal') numerical_feat = dataset.get_features_by_type('numeric') string_feat = dataset.get_features_by_type('string') @@ -107,6 +178,9 @@ return np.array(data_info) def evaluate(self, task_id, time_limit=10*60): + """ + See base class. + """ task_info = ["autokaggle", task_id, time_limit] task = openml.tasks.get_task(task_id) train_indices, test_indices = task.get_train_test_split_indices() @@ -121,9 +195,9 @@ # Train if task.task_type == 'Supervised Classification': - automl = AutoKaggleClassifier() + automl = Classifier() elif task.task_type == 'Supervised Regression': - automl = AutoKaggleRegressor() + automl = Regressor() else: print("UNSUPPORTED TASK_TYPE") assert(0) @@ -145,6 +219,10 @@ def evaluate(self, task_id, time_limit=10*60): # # class BenchmarkingAutoSklearn(BenchmarkingBase): + """ Extends the benchmarking class for evaluating AutoSklearn. + + This class evaluates the performance of AutoSklearn on the input classification or regression task_list.
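For context, the OpenML plumbing used by these benchmarks follows the pattern below. Task id 31 is just an example, and get_X_and_y() is assumed to be available on supervised tasks in openml-python; the code above goes through task.get_dataset() instead.

import openml

task = openml.tasks.get_task(31)                        # example OpenML task id
train_idx, test_idx = task.get_train_test_split_indices()
X, y = task.get_X_and_y()                               # assumed helper on supervised tasks
x_train, y_train = X[train_idx], y[train_idx]
x_test, y_test = X[test_idx], y[test_idx]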
+ """ # def get_data_info(self, categorical_indicator): # return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] # @@ -187,14 +265,29 @@ def evaluate(self, task_id, time_limit=10*60): # self.results.append(result) # print(result) # return result - + + def get_dataset_ids(task_ids): + """ Fetches the dataset_ids. + # Arguments + task_ids: List of ids of OpenML task flows + # Returns + dataset_list: List of the dataset Ids + """ if type(task_ids) == list: - return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids] + return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids] else: - return openml.tasks.get_task(task_ids).dataset_id + return openml.tasks.get_task(task_ids).dataset_id + def get_task_info(task_ids): + """ Fetches the dataset_ids and the task objective. + # Arguments + task_ids: List of ids of OpenML task flows. + # Returns + dataset_list: List of the dataset Ids. + task_types: List of the task type (such as 'binary/multiclass classification' or 'regression' + """ task_types = [] dataset_list = [] for i, t_id in enumerate(task_ids): @@ -211,6 +304,13 @@ def get_task_info(task_ids): def get_dataset_properties(task_ids): + """ Fetches the properties of the dataset for given task flow id + # Arguments + task_ids: List of ids of OpenML task flows + # Returns + Dataframe containing the info of each of the dataset. + This function provides the dataset info such as number of instances, number of numeric/nominal/string columns etc. + """ dataset_list, task_types = get_task_info(task_ids) df = pd.DataFrame(columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", "#Date"]) for i, dataset in enumerate(dataset_list): @@ -225,9 +325,16 @@ def get_dataset_properties(task_ids): ] return df + def get_performance_table(filename, metric): - """ - Read the results csv and convert into the performance table based on the median of the results for each task. + """ Generates a comprehensive report table of AutoML performance. + # Arguments + filename: A csv file containing the results of AutoML runs + metric: Scoring metric to be used for comparison + # Returns + Pandas Dataframe listing the performance of different AutoML systems on the given datasets. + This function reads the results csv and converts it into the performance table based on the median of the results + for each task. """ test = pd.read_csv(filename) perf = pd.DataFrame(columns=["Name", "AutoKaggle", "AutoSklearn", "H2O.ai"]) @@ -247,27 +354,37 @@ def get_performance_table(filename, metric): print(e) return perf + def style_results(res): - """ - Highlight the max results and set index to name + """ Highlights the best result in the results column + # Arguments + res: Dataframe containing the results of various AutoML runs + # Returns + Highlighed data-frame """ def highlight_max(s): - ''' - highlight the maximum in a Series yellow. - ''' + """ + Highlight the maximum in a Series yellow. + """ is_max = s == s.max() return ['background-color: yellow' if v else '' for v in is_max] res = res.set_index("Name") res.style.apply(highlight_max, axis=1) return res -import statistics -def get_box_plot(data, task_id, metric): - """ - Plots the boxplot of variance + +def get_box_plot(results, task_id, metric): + """ Generates a box plot of the variance in the result. 
+ # Arguments + results: Results of various runs using AutoML systems + task_id: Id for OpenML task flow + metric: Score metric considered for the box-plot + # Returns + None + Builds and displays the box plot showing the variance in results for the AutoML performance on the given dataset. """ - auto_sklearn = list(data.loc[(task_id, "autosklearn")][metric]) - auto_kaggle = list(data.loc[(task_id, "autokaggle")][metric]) + auto_sklearn = list(results.loc[(task_id, "autosklearn")][metric]) + auto_kaggle = list(results.loc[(task_id, "autokaggle")][metric]) med_sk = statistics.median(auto_sklearn) med_ak = statistics.median(auto_kaggle) while len(auto_sklearn) < len(auto_kaggle): @@ -290,8 +407,10 @@ def get_box_plot(data, task_id, metric): # ak.run_automation(classification_task_list) # t2 = time.time() # print(t2-t1) + np.random.seed(1001) + random.seed(1001) import time t1 = time.time() - ak.evaluate(31) + ak.evaluate(3021) t2 = time.time() print(t2-t1) diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index fa8638d..e5d3b6f 100644 --- a/examples/tabular_classification_binary.py +++ b/examples/tabular_classification_binary.py @@ -19,7 +19,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggleClassifier() + clf = Classifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 109d361..3426dd3 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggleClassifier() + clf = Classifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index ec5af7a..bf97bc7 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -18,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = AutoKaggleRegressor() + clf = Regressor() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) From 7e651e67348a55a7409c5bad81dc7b90f95a5810 Mon Sep 17 00:00:00 2001 From: Anurag Kapale Date: Thu, 26 Sep 2019 18:59:34 -0500 Subject: [PATCH 31/31] Fix Indent --- autokaggle/auto_ml.py | 278 ++++++++++++++------- autokaggle/config.py | 95 +++++--- autokaggle/ensemblers.py | 82 ++++--- autokaggle/preprocessor.py | 478 ++++++++++++++++++++++++------------- autokaggle/utils.py | 13 +- examples/benchmarking.py | 150 +++++++----- 6 files changed, 715 insertions(+), 381 deletions(-) diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py index 4c2dc47..3cfef69 100644 --- a/autokaggle/auto_ml.py +++ b/autokaggle/auto_ml.py @@ -8,10 +8,13 @@ from joblib import dump, load from autokaggle.preprocessor import Preprocessor -from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, \ + read_json from lightgbm import LGBMClassifier, LGBMRegressor -from autokaggle.config import Config, CLASSIFICATION_PREP_HPARAM_SPACE, REGRESSION_PREP_HPARAM_SPACE, \ - 
REGRESSION_BASE_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, REGRESSION_HPARAM_SPACE +from autokaggle.config import Config, CLASSIFICATION_PREP_HPARAM_SPACE, \ + REGRESSION_PREP_HPARAM_SPACE, \ + REGRESSION_BASE_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE, \ + CLASSIFICATION_HPARAM_SPACE, REGRESSION_HPARAM_SPACE from sklearn.model_selection import StratifiedKFold, KFold import hyperopt from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL @@ -24,45 +27,63 @@ class AutoKaggle(BaseEstimator): """ Automated Machine Learning system class. - AutoKaggle implements an end to end automated ML system. It initiates and searches for the optimum ML pipeline. - The user can use it with the simple `fit()` and `predict()` methods like Sci-kit learn estimators. - The user can specify various parameters controlling different components of the system. + AutoKaggle implements an end to end automated ML system. It initiates and + searches for the optimum ML pipeline. The user can use it with the simple + `fit()` and `predict()` methods like Sci-kit learn estimators. + The user can specify various parameters controlling different components + of the system. # Arguments path: String. OS path for storing temporary model parameters. verbose: Bool. Defines the verbosity of the logging. time_limit: Int. Time budget for performing search and fit pipeline. use_ensembling: Bool. Defines whether to use an ensemble of models - num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + num_estimators_ensemble: Int. Maximum number of estimators to be used + in an ensemble ensemble_strategy: String. Strategy to ensemble models - ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling - random_ensemble: Bool. Whether the ensembling estimators are picked randomly. - diverse_ensemble: Bool. Whether estimators from different families are picked. - ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + ensemble_method: String. Aggregation method if ensemble_strategy is + set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked + randomly. + diverse_ensemble: Bool. Whether estimators from different families are + picked. + ensembling_search_iter: Int. Search iterations for ensembling + hyper-parameter search search_algo: String. Search strategy for hyper-parameter search. search_iter: Int. Number of iterations used for hyper-parameter search. cv_folds: Int. Number of Cross Validation folds. - subsample_ratio: Percent of subsample used for for hyper-parameter search. + subsample_ratio: Percent of subsample used for for hyper-parameter + search. data_info: list(String). Lists the datatypes of each feature column. - stack_probabilities: Bool. Whether to use class probabilities in ensembling. + stack_probabilities: Bool. Whether to use class probabilities in + ensembling. upsample_classes: Bool. Whether to upsample less represented classes num_p_hparams: Int. Number of preprocessor search spaces. 
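A typical call against this class mirrors the example scripts updated later in this patch. The dataset below is a random placeholder and the import path is assumed from the new module layout (the Classifier class lives in autokaggle/auto_ml.py in this patch).

import numpy as np
from autokaggle.auto_ml import Classifier   # import path assumed

x_train = np.random.rand(200, 5)
y_train = np.random.randint(0, 2, 200)
data_info = np.array(['NUM'] * 5)           # one of 'TIME', 'NUM', 'CAT' per column

clf = Classifier()
clf.fit(x_train, y_train, time_limit=60 * 60, data_info=data_info)
y_pred = clf.predict(np.random.rand(50, 5))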
""" def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, - num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', - search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, - stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + num_estimators_ensemble=50, ensemble_strategy='stacking', + ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, + random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, + ensembling_search_iter=10, search_algo='random', num_p_hparams=10): self.is_trained = False if not path: path = rand_temp_folder_generator() - self.config = Config(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, - num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, - ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, - subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, - diverse_ensemble=diverse_ensemble, stack_probabilities=stack_probabilities, + self.config = Config(path=path, verbose=verbose, time_limit=time_limit, + use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, + ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, + search_iter=search_iter, cv_folds=cv_folds, + subsample_ratio=subsample_ratio, + random_ensemble=random_ensemble, + diverse_ensemble=diverse_ensemble, + stack_probabilities=stack_probabilities, data_info=data_info, upsample_classes=upsample_classes, - ensembling_search_iter=ensembling_search_iter, search_algo=search_algo, + ensembling_search_iter=ensembling_search_iter, + search_algo=search_algo, num_p_hparams=num_p_hparams) self.pipeline = None self.m_hparams = None @@ -75,20 +96,25 @@ def fit(self, x, y, time_limit=None, data_info=None): x: A numpy.ndarray instance containing the training data. y: training label vector. time_limit: remaining time budget. - data_info: meta-features of the dataset, which is an numpy.ndarray describing the feature type of each - column in raw_x. The feature type include: 'TIME' for temporal feature, 'NUM' for other numerical feature, + data_info: meta-features of the dataset, which is an numpy.ndarray + describing the feature type of each + column in raw_x. The feature type include: 'TIME' for temporal + feature, 'NUM' for other numerical feature, and 'CAT' for categorical feature. # Returns None Both inputs X and y are numpy arrays. - If fit is called multiple times on incremental data (train, test1, test2, etc.) - you should warm-start your training from the pre-trained model. Past data will + If fit is called multiple times on incremental data (train, test1, test2, + etc.) + you should warm-start your training from the pre-trained model. Past data + will NOT be available for re-training. """ self.config.time_limit = time_limit if time_limit else 24 * 60 * 60 # Extract or read data info - self.config.data_info = data_info if data_info is not None else self.extract_data_info(x) + self.config.data_info = data_info if data_info is not None else \ + self.extract_data_info(x) if self.config.verbose: print('DATA_INFO: {}'.format(self.config.data_info)) @@ -130,7 +156,8 @@ def predict(self, x_test): return y def predict_proba(self, x_test): - """ Predict label probabilities on the test data for the given classification task. 
+ """ Predict label probabilities on the test data for the given + classification task. # Arguments x_test: A numpy.ndarray instance containing the test data. # Returns @@ -143,7 +170,8 @@ def predict_proba(self, x_test): return y def evaluate(self, x_test, y_test): - """ Predict label probabilities on the test data for the given classification task. + """ Predict label probabilities on the test data for the given + classification task. # Arguments x_test: A numpy.ndarray instance containing the training data. y_test: A numpy array with ground truth labels for the test data @@ -171,7 +199,8 @@ def resample(self, x, y): Up-sampled version of the dataset """ if self.config.upsample_classes: - x, y = SMOTE(sampling_strategy=self.config.resampling_strategy).fit_resample(x, y) + x, y = SMOTE( + sampling_strategy=self.config.resampling_strategy).fit_resample(x, y) while x.shape[0] < 60: x = np.concatenate([x, x], axis=0) y = np.concatenate([y, y], axis=0) @@ -207,16 +236,19 @@ def search(self, x, y, prep_space, model_space): # Returns List of hyper-parameter trials """ - grid_train_x, grid_train_y = self.subsample(x, y, sample_percent=self.config.subsample_ratio) + grid_train_x, grid_train_y = \ + self.subsample(x, y, sample_percent=self.config.subsample_ratio) score_metric, skf = self.get_skf(self.config.cv_folds) def objective_func(params): model_class = params['estimator']['model'] m_params = params['estimator']['param'] p_params = params['prep'] - pipeline = AutoPipe(model_class=model_class, m_params=m_params, p_params=p_params, config=self.config) + pipeline = AutoPipe(model_class=model_class, m_params=m_params, + p_params=p_params, config=self.config) try: - eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y, scoring=score_metric, cv=skf).mean() + eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y, + scoring=score_metric, cv=skf).mean() status = STATUS_OK except ValueError as e: print(e) @@ -226,13 +258,16 @@ def objective_func(params): print("CV Score:", eval_score) print("\n=================") loss = 1 - eval_score if status == STATUS_OK else float('inf') - return {'loss': loss, 'status': status, 'model_class': model_class, 'm_params': m_params, + return {'loss': loss, 'status': status, 'model_class': model_class, + 'm_params': m_params, 'p_params': p_params} trials = Trials() search_space = {'prep': prep_space, 'estimator': model_space} - _ = fmin(objective_func, search_space, algo=self.config.search_algo, trials=trials, - max_evals=self.config.search_iter, rstate=np.random.RandomState(self.config.random_state)) + _ = fmin(objective_func, search_space, algo=self.config.search_algo, + trials=trials, + max_evals=self.config.search_iter, + rstate=np.random.RandomState(self.config.random_state)) return trials def get_best_pipeline(self, trials): @@ -246,7 +281,8 @@ def get_best_pipeline(self, trials): best_pipeline = self.setup_ensemble(trials) else: opt = trials.best_trial['result'] - best_pipeline = AutoPipe(opt['model_class'], opt['m_params'], opt['p_params'], self.config) + best_pipeline = AutoPipe(opt['model_class'], opt['m_params'], + opt['p_params'], self.config) if self.config.verbose: print("The best hyperparameter setting found:") print(opt) @@ -288,7 +324,8 @@ def pick_diverse_estimators(self, trial_list): # Arguments trial_list: List of the hyper-parameter search trials. # Returns - List of top hyper-parameter spaces equally selected from each estimator family. + List of top hyper-parameter spaces equally selected from each + estimator family. 
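The search loop above is standard hyperopt usage. A self-contained miniature of the same pattern (an objective returning a loss/status dict, a Trials store, fmin with a fixed random state), using a plain scikit-learn estimator in place of AutoPipe:

import numpy as np
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

x, y = make_classification(n_samples=200, random_state=0)
space = {'n_estimators': hp.choice('n_estimators', [50, 100, 200]),
         'max_depth': hp.choice('max_depth', list(range(3, 10)))}

def objective(params):
    # Cross-validated score on a subsample; the loss is 1 - score, as in search().
    score = cross_val_score(RandomForestClassifier(**params), x, y,
                            scoring='f1_weighted', cv=3).mean()
    return {'loss': 1 - score, 'status': STATUS_OK, 'params': params}

trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, trials=trials, max_evals=10,
            rstate=np.random.RandomState(1001))  # RandomState as in search() above;
                                                 # newer hyperopt expects np.random.default_rng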
""" groups = collections.defaultdict(list) @@ -299,7 +336,8 @@ def pick_diverse_estimators(self, trial_list): while idx < self.config.num_estimators_ensemble: for grp in groups.values(): if j < len(grp): - est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'], grp[j]['p_params'], self.config) + est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'], + grp[j]['p_params'], self.config) estimator_list.append(est) idx += 1 j += 1 @@ -310,13 +348,15 @@ def setup_ensemble(self, trials): # Arguments trials: List of the hyper-parameter search trials. # Returns - An ensembling estimator to be trained using the base estimators picked from trials. + An ensembling estimator to be trained using the base estimators picked + from trials. """ # Filter the unsuccessful hparam spaces i.e. 'loss' == float('inf') best_trials = [t for t in trials.results if t['loss'] != float('inf')] best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False) - self.config.num_estimators_ensemble = min(self.config.num_estimators_ensemble, len(best_trials)) + self.config.num_estimators_ensemble = min( + self.config.num_estimators_ensemble, len(best_trials)) if self.config.random_ensemble: np.random.shuffle(best_trials) @@ -326,20 +366,25 @@ def setup_ensemble(self, trials): else: estimator_list = [] for i in range(self.config.num_estimators_ensemble): - est = AutoPipe(best_trials[i]['model_class'], best_trials[i]['m_params'], best_trials[i]['p_params'], + est = AutoPipe(best_trials[i]['model_class'], + best_trials[i]['m_params'], + best_trials[i]['p_params'], self.config) estimator_list.append(est) if self.config.ensemble_strategy == 'stacking': - best_estimator_ = StackedEnsemblingModel(estimator_list, config=self.config) + best_estimator_ = StackedEnsemblingModel(estimator_list, + config=self.config) else: - best_estimator_ = RankedEnsemblingModel(estimator_list, config=self.config) + best_estimator_ = RankedEnsemblingModel(estimator_list, + config=self.config) return best_estimator_ @staticmethod def extract_data_info(raw_x): """ - Extracts the data info automatically based on the type of each feature in raw_x. + Extracts the data info automatically based on the type of each feature in + raw_x. # Arguments raw_x: a numpy.ndarray instance containing the training data. # Returns @@ -365,36 +410,55 @@ class Classifier(AutoKaggle): verbose: Bool. Defines the verbosity of the logging. time_limit: Int. Time budget for performing search and fit pipeline. use_ensembling: Bool. Defines whether to use an ensemble of models - num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + num_estimators_ensemble: Int. Maximum number of estimators to be used + in an ensemble ensemble_strategy: String. Strategy to ensemble models - ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling - random_ensemble: Bool. Whether the ensembling estimators are picked randomly. - diverse_ensemble: Bool. Whether estimators from different families are picked. - ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + ensemble_method: String. Aggregation method if ensemble_strategy is + set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked + randomly. + diverse_ensemble: Bool. Whether estimators from different families are + picked. + ensembling_search_iter: Int. Search iterations for ensembling + hyper-parameter search search_algo: String. Search strategy for hyper-parameter search. search_iter: Int. 
Number of iterations used for hyper-parameter search. cv_folds: Int. Number of Cross Validation folds. - subsample_ratio: Percent of subsample used for for hyper-parameter search. + subsample_ratio: Percent of subsample used for for hyper-parameter + search. data_info: list(String). Lists the datatypes of each feature column. - stack_probabilities: Bool. Whether to use class probabilities in ensembling. + stack_probabilities: Bool. Whether to use class probabilities in + ensembling. upsample_classes: Bool. Whether to upsample less represented classes num_p_hparams: Int. Number of preprocessor search spaces. """ + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, - num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', - search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, - stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + num_estimators_ensemble=50, ensemble_strategy='stacking', + ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, + random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, + ensembling_search_iter=10, search_algo='random', num_p_hparams=10): - super().__init__(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, - num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, - ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, - subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, diverse_ensemble=diverse_ensemble, - stack_probabilities=stack_probabilities, data_info=data_info, - upsample_classes=upsample_classes, ensembling_search_iter=ensembling_search_iter, + super().__init__(path=path, verbose=verbose, time_limit=time_limit, + use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, + ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, + cv_folds=cv_folds, + subsample_ratio=subsample_ratio, + random_ensemble=random_ensemble, + diverse_ensemble=diverse_ensemble, + stack_probabilities=stack_probabilities, + data_info=data_info, + upsample_classes=upsample_classes, + ensembling_search_iter=ensembling_search_iter, search_algo=search_algo, num_p_hparams=num_p_hparams) self.config.objective = 'classification' - self.m_hparams = hp.choice('classifier', [CLASSIFICATION_HPARAM_SPACE[m] for m in - self.config.classification_models]) + self.m_hparams = hp.choice('classifier', + [CLASSIFICATION_HPARAM_SPACE[m] for m in + self.config.classification_models]) self.m_hparams_base = hp.choice('classifier', [CLASSIFICATION_BASE_HPARAM_SPACE[m] for m in self.config.classification_models]) @@ -406,10 +470,12 @@ def get_skf(self, folds): """ if self.config.objective == 'binary': score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = StratifiedKFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) else: score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = StratifiedKFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) return score_metric, skf @@ -422,63 +488,89 @@ class Regressor(AutoKaggle): verbose: Bool. Defines the verbosity of the logging. time_limit: Int. 
Time budget for performing search and fit pipeline. use_ensembling: Bool. Defines whether to use an ensemble of models - num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + num_estimators_ensemble: Int. Maximum number of estimators to be used + in an ensemble ensemble_strategy: String. Strategy to ensemble models - ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling - random_ensemble: Bool. Whether the ensembling estimators are picked randomly. - diverse_ensemble: Bool. Whether estimators from different families are picked. - ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + ensemble_method: String. Aggregation method if ensemble_strategy is + set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked + randomly. + diverse_ensemble: Bool. Whether estimators from different families are + picked. + ensembling_search_iter: Int. Search iterations for ensembling + hyper-parameter search search_algo: String. Search strategy for hyper-parameter search. search_iter: Int. Number of iterations used for hyper-parameter search. cv_folds: Int. Number of Cross Validation folds. - subsample_ratio: Percent of subsample used for for hyper-parameter search. + subsample_ratio: Percent of subsample used for for hyper-parameter + search. data_info: list(String). Lists the datatypes of each feature column. - stack_probabilities: Bool. Whether to use class probabilities in ensembling. + stack_probabilities: Bool. Whether to use class probabilities in + ensembling. upsample_classes: Bool. Whether to upsample less represented classes num_p_hparams: Int. Number of preprocessor search spaces. """ + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, - num_estimators_ensemble=50, ensemble_strategy='stacking', ensemble_method='max_voting', - search_iter=500, cv_folds=3, subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, - stack_probabilities=False, data_info=None, upsample_classes=False, ensembling_search_iter=10, + num_estimators_ensemble=50, ensemble_strategy='stacking', + ensemble_method='max_voting', + search_iter=500, cv_folds=3, subsample_ratio=0.1, + random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, data_info=None, upsample_classes=False, + ensembling_search_iter=10, search_algo='random', num_p_hparams=10): - super().__init__(path=path, verbose=verbose, time_limit=time_limit, use_ensembling=use_ensembling, - num_estimators_ensemble=num_estimators_ensemble, ensemble_strategy=ensemble_strategy, - ensemble_method=ensemble_method, search_iter=search_iter, cv_folds=cv_folds, - subsample_ratio=subsample_ratio, random_ensemble=random_ensemble, + super().__init__(path=path, verbose=verbose, time_limit=time_limit, + use_ensembling=use_ensembling, + num_estimators_ensemble=num_estimators_ensemble, + ensemble_strategy=ensemble_strategy, + ensemble_method=ensemble_method, search_iter=search_iter, + cv_folds=cv_folds, + subsample_ratio=subsample_ratio, + random_ensemble=random_ensemble, diverse_ensemble=diverse_ensemble, - stack_probabilities=stack_probabilities, data_info=data_info, - upsample_classes=upsample_classes, ensembling_search_iter=ensembling_search_iter, + stack_probabilities=stack_probabilities, + data_info=data_info, + upsample_classes=upsample_classes, + ensembling_search_iter=ensembling_search_iter, search_algo=search_algo, num_p_hparams=num_p_hparams) self.config.objective = 'regression' - 
self.m_hparams = hp.choice('regressor', [REGRESSION_HPARAM_SPACE[m] for m in self.config.regression_models]) + self.m_hparams = hp.choice('regressor', [REGRESSION_HPARAM_SPACE[m] for m in + self.config.regression_models]) self.m_hparams_base = hp.choice('regressor', - [REGRESSION_BASE_HPARAM_SPACE[m] for m in self.config.classification_models]) + [REGRESSION_BASE_HPARAM_SPACE[m] for m in + self.config.classification_models]) self.p_hparams_base = REGRESSION_PREP_HPARAM_SPACE def get_skf(self, folds): """ See the base class. """ - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) class AutoPipe(BaseEstimator): """ Implements a machine learning pipeline. - Implements a machine learning pipeline with preprocessor and estimator. A user can call fit(), and predict() - methods on it. It is used as a search unit in AutoKaggle's hyeper-parameter search. + Implements a machine learning pipeline with preprocessor and estimator. A + user can call fit(), and predict() methods on it. It is used as a search + unit in AutoKaggle's hyeper-parameter search. # Arguments - config: Config. Defines the configuration of various components of the pipeline. + config: Config. Defines the configuration of various components of the + pipeline. m_params: Dict. Hyper-parameter search space for estimator. p_params: Dict. Hyper-parameter search space for preprocessor. model_class: Estimator. Class name of the estimator used in the pipeline. - _estimator_type: String. Denotes if the estimator is 'classifier' or 'regressor' - prep: Preprocessor. Instance of the Preprocessor class, which does basic feature preprocessing and feature + _estimator_type: String. Denotes if the estimator is 'classifier' or + 'regressor' + prep: Preprocessor. Instance of the Preprocessor class, which does + basic feature preprocessing and feature engineering - model: Estimator. Instance of the estimator class which learns a machine learning model and predicts on the + model: Estimator. Instance of the estimator class which learns a + machine learning model and predicts on the given data. """ + def __init__(self, model_class, m_params, p_params, config): self.prep = None self.model = None @@ -486,7 +578,8 @@ def __init__(self, model_class, m_params, p_params, config): self.m_params = m_params self.p_params = p_params self.model_class = model_class - self._estimator_type = 'classifier' if is_classifier(model_class) else 'regressor' + self._estimator_type = 'classifier' if is_classifier( + model_class) else 'regressor' def fit(self, x, y): """ Trains the given pipeline. @@ -513,7 +606,8 @@ def predict(self, x): return self.model.predict(x) def predict_proba(self, x): - """ Predict label probabilities on the test data for the given classification task. + """ Predict label probabilities on the test data for the given + classification task. # Arguments x: A numpy.ndarray instance containing the test data. 
# Returns diff --git a/autokaggle/config.py b/autokaggle/config.py index 87a4f30..729c156 100644 --- a/autokaggle/config.py +++ b/autokaggle/config.py @@ -4,7 +4,8 @@ from hyperopt import hp from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier -from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, AdaBoostRegressor,\ +from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \ + RandomForestRegressor, AdaBoostRegressor, \ ExtraTreesRegressor from sklearn.linear_model import Ridge from lightgbm import LGBMClassifier, LGBMRegressor @@ -15,33 +16,45 @@ class Config: """ Configuration for various autoML components. - Defines the common configuration of different auto ML components. It is shared between AutoKaggle, AutoPipe, - Preprocessor and Ensembling class. + Defines the common configuration of different auto ML components. It is + shared between AutoKaggle, AutoPipe, Preprocessor and Ensembling class. # Arguments path: String. OS path for storing temporary model parameters. verbose: Bool. Defines the verbosity of the logging. time_limit: Int. Time budget for performing search and fit pipeline. use_ensembling: Bool. Defines whether to use an ensemble of models - num_estimators_ensemble: Int. Maximum number of estimators to be used in an ensemble + num_estimators_ensemble: Int. Maximum number of estimators to be used + in an ensemble ensemble_strategy: String. Strategy to ensemble models - ensemble_method: String. Aggregation method if ensemble_strategy is set to ranked_ensembling - random_ensemble: Bool. Whether the ensembling estimators are picked randomly. - diverse_ensemble: Bool. Whether estimators from different families are picked. - ensembling_search_iter: Int. Search iterations for ensembling hyper-parameter search + ensemble_method: String. Aggregation method if ensemble_strategy is + set to ranked_ensembling + random_ensemble: Bool. Whether the ensembling estimators are picked + randomly. + diverse_ensemble: Bool. Whether estimators from different families are + picked. + ensembling_search_iter: Int. Search iterations for ensembling + hyper-parameter search search_algo: String. Search strategy for hyper-parameter search. search_iter: Int. Number of iterations used for hyper-parameter search. cv_folds: Int. Number of Cross Validation folds. - subsample_ratio: Percent of subsample used for for hyper-parameter search. + subsample_ratio: Percent of subsample used for for hyper-parameter + search. data_info: list(String). Lists the datatypes of each feature column. - stack_probabilities: Bool. Whether to use class probabilities in ensembling. + stack_probabilities: Bool. Whether to use class probabilities in + ensembling. upsample_classes: Bool. Whether to upsample less represented classes num_p_hparams: Int. Number of preprocessor search spaces. 
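In practice the shared Config object is built by AutoKaggle from its keyword arguments and then read by the preprocessor and ensemblers. A small, hedged illustration (the argument values are arbitrary):

from autokaggle.config import Config

config = Config(verbose=True, cv_folds=5, use_ensembling=True,
                ensemble_strategy='stacking')
config.objective = 'classification'   # set by the Classifier/Regressor subclasses
print(config.random_state)            # 1001; seeds the CV splits and the search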
""" - def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, num_estimators_ensemble=50, - ensemble_strategy='stacking', ensemble_method='max_voting', search_iter=500, cv_folds=3, - subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, stack_probabilities=False, - data_info=None, upsample_classes=False, ensembling_search_iter=10, search_algo='random', + + def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True, + num_estimators_ensemble=50, + ensemble_strategy='stacking', ensemble_method='max_voting', + search_iter=500, cv_folds=3, + subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True, + stack_probabilities=False, + data_info=None, upsample_classes=False, ensembling_search_iter=10, + search_algo='random', num_p_hparams=10): self.verbose = verbose self.path = path if path is not None else rand_temp_folder_generator() @@ -61,15 +74,18 @@ def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True self.subsample_ratio = subsample_ratio self.resampling_strategy = 'auto' self.random_state = 1001 - self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest', 'adaboost'] + self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest', + 'adaboost'] # self.classification_models = ['knn', 'lgbm', 'random_forest',] - self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', 'adaboost', 'catboost'] + self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest', + 'adaboost', 'catboost'] self.diverse_ensemble = diverse_ensemble self.stack_probabilities = stack_probabilities self.data_info = data_info self.upsample_classes = upsample_classes self.ensembling_search_iter = ensembling_search_iter - self.search_algo = hyperopt.rand.suggest if search_algo == 'random' else hyperopt.tpe.suggest + self.search_algo = hyperopt.rand.suggest if search_algo == 'random' else \ + hyperopt.tpe.suggest self.num_p_hparams = num_p_hparams def update(self, options): @@ -81,7 +97,8 @@ def update(self, options): KNN_CLASSIFIER_PARAMS = { 'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]), 'weights': hp.choice('weight_knn', ['uniform', 'distance']), - 'metric': hp.choice('metric_knn', ["euclidean", "manhattan", "chebyshev", "minkowski"]), + 'metric': hp.choice('metric_knn', + ["euclidean", "manhattan", "chebyshev", "minkowski"]), 'p': hp.choice('p_knn', range(1, 3)), } @@ -110,20 +127,24 @@ def update(self, options): 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2), high=np.log(2)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2), + high=np.log(2)), } ADABOOST_CLASSIFIER_PARAMS = { 'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)), - 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), + high=np.log(2)), } CATBOOST_CLASSIFIER_PARAMS = { 'iterations': hp.choice('iterations_catboost', [5, 10]), 'depth': hp.choice('depth_catboost', range(4, 11)), - 'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), high=np.log(1)), - 'loss_function': hp.choice('loss_function_catboost', ['Logloss', 'CrossEntropy']), + 
'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3), + high=np.log(1)), + 'loss_function': hp.choice('loss_function_catboost', + ['Logloss', 'CrossEntropy']), 'verbose': True, 'leaf_estimation_iterations': 10, 'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3)) @@ -163,13 +184,15 @@ def update(self, options): 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), - 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), high=np.log(1)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), + high=np.log(1)), } ADABOOST_REGRESSOR_PARAMS = { 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)), - 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), high=np.log(2)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), + high=np.log(2)), # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)), } @@ -181,7 +204,6 @@ def update(self, options): 'verbose': True } - REGRESSION_HPARAM_SPACE = { 'extratree': { 'model': ExtraTreesRegressor, @@ -202,14 +224,13 @@ def update(self, options): 'adaboost': { 'model': AdaBoostRegressor, 'param': ADABOOST_REGRESSOR_PARAMS - }, + }, 'catboost': { 'model': CatBoostRegressor, 'param': CATBOOST_REGRESSOR_PARAMS } } - CLASSIFICATION_HPARAM_SPACE = { 'knn': { 'model': KNeighborsClassifier, @@ -284,7 +305,7 @@ def update(self, options): 'adaboost': { 'model': AdaBoostRegressor, 'param': {} - }, + }, 'catboost': { 'model': CatBoostRegressor, 'param': {} @@ -292,7 +313,8 @@ def update(self, options): } REGRESSION_PREP_HPARAM_SPACE = { - 'cat_encoding': hp.choice('cat_enc', ['count', 'target+count', 'target+label', 'label']), + 'cat_encoding': hp.choice('cat_enc', + ['count', 'target+count', 'target+label', 'label']), 'scaling': hp.choice('scaling', [True, False]), 'log_transform': hp.choice('log_transform', [True, False]), 'power_transform': hp.choice('power_transform', [True, False]), @@ -300,15 +322,18 @@ def update(self, options): 'binning': hp.choice('binning', [True, False]), 'add_time_offset': hp.choice('add_time_offset', [True, False]), 'add_time_diff': hp.choice('add_time_diff', [True, False]), - # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', 'min', None]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', + # 'min', None]), # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), - 'imputation_strategy': hp.choice('imputation_strategy', ['most_frequent', 'zero']), + 'imputation_strategy': hp.choice('imputation_strategy', + ['most_frequent', 'zero']), 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) } CLASSIFICATION_PREP_HPARAM_SPACE = { - 'cat_encoding': hp.choice('cat_enc', ['target', 'count', 'target+count', 'target+label']), + 'cat_encoding': hp.choice('cat_enc', + ['target', 'count', 'target+count', 'target+label']), 'scaling': hp.choice('scaling', [True, False]), 'log_transform': hp.choice('log_transform', [True, False]), 'power_transform': hp.choice('power_transform', [True, False]), @@ -316,9 +341,11 @@ def update(self, options): 'binning': hp.choice('binning', [True, False]), 'add_time_offset': hp.choice('add_time_offset', [True, False]), 
'add_time_diff': hp.choice('add_time_diff', [True, False]), - # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', 'min', None]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', + # 'min', None]), # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), - 'imputation_strategy': hp.choice('imputation_strategy', ['most_frequent', 'zero']), + 'imputation_strategy': hp.choice('imputation_strategy', + ['most_frequent', 'zero']), 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) } diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py index fa394d2..95d0435 100644 --- a/autokaggle/ensemblers.py +++ b/autokaggle/ensemblers.py @@ -1,5 +1,6 @@ from sklearn.base import BaseEstimator -from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, read_json +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, \ + read_json from abc import abstractmethod import numpy as np import os @@ -17,26 +18,31 @@ from sklearn.model_selection import RandomizedSearchCV, cross_val_score import hyperopt from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK -from autokaggle.config import REGRESSION_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE,\ +from autokaggle.config import REGRESSION_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, \ + CLASSIFICATION_BASE_HPARAM_SPACE, \ REGRESSION_BASE_HPARAM_SPACE class EnsemblingModel: """ Base class for ensembling estimators. - This class creates an ensembling estimator from a given list of estimators. The user can call fit() and - predict() methods, similar to the scikit-learn estimators. + This class creates an ensembling estimator from a given list of estimators. + The user can call fit() and predict() methods, similar to the scikit-learn + estimators. # Arguments - config: Config. Defines the configuration of various components of the autoML pipeline. - estimator_list: List. List of the estimators, to be used for building an ensemble. + config: Config. Defines the configuration of various components of the + autoML pipeline. + estimator_list: List. List of the estimators, to be used for building an + ensemble. """ + def __init__(self, estimator_list, config): self.config = config self.estimator_list = estimator_list @abstractmethod - def fit(self, X, y): + def fit(self, x, y): """ Trains the ensemble of estimators on the training data. # Arguments X: A numpy array instance containing the training data. @@ -46,13 +52,14 @@ def fit(self, X, y): pass @abstractmethod - def predict(self, X): + def predict(self, x): """ Generate prediction on the test data for the given task. # Arguments X: A numpy array instance containing the test data. # Returns A numpy array for the predictions on the x_test. - This function provides predicts on the input data using the ensemble of estimators. + This function provides predicts on the input data using the ensemble of + estimators. """ pass @@ -60,17 +67,18 @@ def predict(self, X): class RankedEnsemblingModel(EnsemblingModel): """ Implements ensembling using ranking based methods. 
- This class implements randing based ensembling using ensembling methods amongst: ('mean', 'median', 'max' and - 'majority_voting') + This class implements ranking based ensembling using ensembling methods + amongst: ('mean', 'median', 'max' and 'majority_voting') """ - def fit(self, X, y): + + def fit(self, x, y): for est in self.estimator_list: - est.fit(X, y) - - def predict(self, X): - predictions = np.zeros((len(X), len(self.estimator_list))) + est.fit(x, y) + + def predict(self, x): + predictions = np.zeros((len(x), len(self.estimator_list))) for i, est in enumerate(self.estimator_list): - predictions[:, i] = est.predict(X) + predictions[:, i] = est.predict(x) if self.config.ensemble_method == 'median': return np.median(predictions, axis=1) @@ -87,21 +95,25 @@ def predict(self, X): class StackedEnsemblingModel(EnsemblingModel): """ Implements a stacking based ensembling estimator. - This class creates an ensembling estimator using stacking. It trains an Light-GBM model on the predictions of - thh base estimator. + This class creates an ensembling estimator using stacking. It trains a + LightGBM model on the predictions of the base estimators. # Arguments - stacking_estimator: LightGBM estimator. Meta-learning algorithm for the stacking estimator. + stacking_estimator: LightGBM estimator. Meta-learning algorithm for the + stacking estimator. """ + def __init__(self, estimator_list, config): super().__init__(estimator_list, config) self.stacking_estimator = None if self.config.objective == 'regression': - self.hparams = hp.choice('regressor', [REGRESSION_BASE_HPARAM_SPACE['lgbm']]) + self.hparams = hp.choice('regressor', + [REGRESSION_BASE_HPARAM_SPACE['lgbm']]) self.config.stack_probabilities = False else: - self.hparams = hp.choice('classifier', + [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']]) - self.hparams = hp.choice('classifier', [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']]) + self.hparams = hp.choice('classifier', + [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']]) def get_model_predictions(self, X): """ Generate the combined predictions from the list of the estimators.
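# The stacking flow that StackedEnsemblingModel implements, reduced to a minimal
# stand-alone sketch (illustrative names only, not part of this patch). The patch's
# version additionally tunes the LightGBM meta-learner with hyperopt in search() and
# exposes a stack_probabilities option; the core idea is the same:
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

def stacked_fit_predict(estimator_list, x, y, x_test):
    # Base estimators are fit on a train split only; the held-out split provides
    # unbiased predictions on which the meta-learner is trained.
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=0.2, random_state=0)
    for est in estimator_list:
        est.fit(x_tr, y_tr)
    val_preds = np.column_stack([est.predict(x_val) for est in estimator_list])
    meta = LGBMClassifier(n_estimators=50)
    meta.fit(val_preds, y_val)
    test_preds = np.column_stack([est.predict(x_test) for est in estimator_list])
    return meta.predict(test_preds)

# e.g. stacked_fit_predict([RandomForestClassifier(), LogisticRegression(max_iter=500)],
#                          x_train, y_train, x_test)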
@@ -126,8 +138,8 @@ def get_model_predictions(self, X): predictions[:, i] = est.predict(X) return predictions - def fit(self, X, y): - x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2) + def fit(self, x, y): + x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2) for est in self.estimator_list: est.fit(x_train, y_train) predictions = self.get_model_predictions(x_val) @@ -146,7 +158,8 @@ def search(self, x, y): def objective_func(args): clf = args['model'](**args['param']) try: - eval_score = cross_val_score(clf, x, y, scoring=score_metric, cv=skf).mean() + eval_score = cross_val_score(clf, x, y, scoring=score_metric, + cv=skf).mean() except ValueError: eval_score = 0 if self.config.verbose: @@ -155,8 +168,10 @@ def objective_func(args): return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args} trials = Trials() - best = fmin(objective_func, self.hparams, algo=self.config.search_algo, trials=trials, - max_evals=self.config.ensembling_search_iter, rstate=np.random.RandomState(self.config.random_state)) + best = fmin(objective_func, self.hparams, algo=self.config.search_algo, + trials=trials, + max_evals=self.config.ensembling_search_iter, + rstate=np.random.RandomState(self.config.random_state)) opt = space_eval(self.hparams, best) best_estimator_ = opt['model'](**opt['param']) @@ -165,8 +180,8 @@ def objective_func(args): print(opt) return best_estimator_ - def predict(self, X): - predictions = self.get_model_predictions(X) + def predict(self, x): + predictions = self.get_model_predictions(x) return self.stacking_estimator.predict(predictions) def get_skf(self, folds): @@ -178,13 +193,16 @@ def get_skf(self, folds): """ if self.config.objective == 'binary': score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = StratifiedKFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) elif self.config.objective == 'multiclass': score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = StratifiedKFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) elif self.config.objective == 'regression': score_metric = 'neg_mean_squared_error' - skf = KFold(n_splits=folds, shuffle=True, random_state=self.config.random_state) + skf = KFold(n_splits=folds, shuffle=True, + random_state=self.config.random_state) else: ValueError("Invalid objective") return score_metric, skf diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py index d286dd9..f5ba361 100644 --- a/autokaggle/preprocessor.py +++ b/autokaggle/preprocessor.py @@ -4,7 +4,8 @@ import itertools from scipy.stats import pearsonr from sklearn.decomposition import PCA -from sklearn.preprocessing import StandardScaler, PowerTransformer, KBinsDiscretizer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, PowerTransformer, \ + KBinsDiscretizer, OneHotEncoder from sklearn.base import TransformerMixin from sklearn.base import BaseEstimator from sklearn.impute import SimpleImputer @@ -12,20 +13,25 @@ from abc import abstractmethod import collections from lightgbm import LGBMClassifier, LGBMRegressor + LEVEL_HIGH = 32 class Preprocessor(TransformerMixin): """ Implements basic preprocessing and feature engineering class. - Preprocessor takes care of the basic preprocessing and feature engineering of the input data. Similar to - Scikit-learn transformers, it implements the fit() and transform() methods. 
TO acheive this It applies various - feature primitives in a sequence using scikit-learn pipeline. + Preprocessor takes care of the basic preprocessing and feature engineering of + the input data. Similar to Scikit-learn transformers,it implements the fit() + and transform() methods. TO acheive this It applies various feature + primitives in a sequence using scikit-learn pipeline. # Arguments - config: Config. Defines the configuration of various components of the autoML pipeline. + config: Config. Defines the configuration of various components of the + AutoML pipeline. params: Dict. Hyper-parameter search space for preprocessor. - pipeline: Pipeline. Sci-kit learn pipeline class to apply the feature primitives in sequence + pipeline: Pipeline. Sci-kit learn pipeline class to apply the feature + primitives in sequence """ + def __init__(self, config, params): self.config = config self.params = params @@ -61,8 +67,8 @@ def transform(self, raw_x): raw_x: a numpy array instance containing the training/testing data # Returns A numpy array instance containing the transformed data. - This function provides transforms the input data by applying the transformations using the pre-trained - preprocessor chain. + This function provides transforms the input data by applying the + transformations using the pre-trained preprocessor chain. """ # Get Meta-Feature data = TabularData(raw_x, self.config.data_info, self.config.verbose) @@ -80,19 +86,30 @@ def get_categorical_pipeline(params): choice = params.get('cat_encoding', 'target') cat_pipeline = [] if choice == 'target': - cat_pipeline.append(('target_encoder', TargetEncoder(operation='upd', selected_type='CAT'))) + cat_pipeline.append(('target_encoder', TargetEncoder(operation='upd', + selected_type='CAT') + )) elif choice == 'label': - cat_pipeline.append(('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) + cat_pipeline.append( + ('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) elif choice == 'count': - cat_pipeline.append(('count_encoder', CatCount(operation='upd', selected_type='CAT'))) + cat_pipeline.append( + ('count_encoder', CatCount(operation='upd', selected_type='CAT'))) elif choice == 'target+count': - cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', selected_type='CAT'))) - cat_pipeline.append(('count_encoder', CatCount(operation='upd', selected_type='CAT'))) + cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', + selected_type='CAT') + )) + cat_pipeline.append( + ('count_encoder', CatCount(operation='upd', selected_type='CAT'))) elif choice == 'one_hot': - cat_pipeline.append(('one_hot_encoder', OneHot(operation='upd', selected_type='CAT'))) + cat_pipeline.append( + ('one_hot_encoder', OneHot(operation='upd', selected_type='CAT'))) elif choice == 'target+label': - cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', selected_type='CAT'))) - cat_pipeline.append(('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) + cat_pipeline.append(('target_encoder', TargetEncoder(operation='add', + selected_type='CAT') + )) + cat_pipeline.append( + ('label_encoder', LabelEncode(operation='upd', selected_type='CAT'))) else: raise ValueError return cat_pipeline @@ -113,15 +130,21 @@ def get_numerical_pipeline(params): numeric_pipeline = [] if scaling: - numeric_pipeline.append(('scaler', TabScaler(operation='upd', selected_type='NUM'))) + numeric_pipeline.append( + ('scaler', TabScaler(operation='upd', selected_type='NUM'))) if log_transform: 
- numeric_pipeline.append(('log_transform', LogTransform(operation='upd', selected_type='NUM'))) + numeric_pipeline.append(('log_transform', + LogTransform(operation='upd', + selected_type='NUM'))) if power_transform: - numeric_pipeline.append(('boxcox', BoxCox(operation='upd', selected_type='NUM'))) + numeric_pipeline.append( + ('boxcox', BoxCox(operation='upd', selected_type='NUM'))) if pca: - numeric_pipeline.append(('pca', TabPCA(operation='add', selected_type='NUM'))) + numeric_pipeline.append( + ('pca', TabPCA(operation='add', selected_type='NUM'))) if binning: - numeric_pipeline.append(('binning', Binning(operation='add', selected_type='NUM'))) + numeric_pipeline.append( + ('binning', Binning(operation='add', selected_type='NUM'))) return numeric_pipeline def get_filtering_pipeline(self, params): @@ -134,15 +157,19 @@ def get_filtering_pipeline(self, params): pearson_thresh = params.get('pearson_thresh', 0) feat_importance_thresh = params.get('feat_importance_thresh', 0) - filter_pipeline = [('filter', FilterConstant(operation='del', selected_type='ALL'))] + filter_pipeline = [ + ('filter', FilterConstant(operation='del', selected_type='ALL'))] if pearson_thresh > 0: - filter_pipeline.append(('pearson_corr', FeatureFilter(operation='del', selected_type='ALL', - threshold=pearson_thresh))) + filter_pipeline.append( + ('pearson_corr', FeatureFilter(operation='del', selected_type='ALL', + threshold=pearson_thresh))) if feat_importance_thresh > 0: - filter_pipeline.append(('lgbm_feat_selection', FeatureImportance(operation='del', - selected_type='ALL', - threshold=feat_importance_thresh, - task_type=self.config.objective))) + filter_pipeline.append( + ('lgbm_feat_selection', + FeatureImportance(operation='del', + selected_type='ALL', + threshold=feat_importance_thresh, + task_type=self.config.objective))) return filter_pipeline @staticmethod @@ -157,9 +184,11 @@ def get_time_pipeline(params): add_diff = params.get('add_time_diff', False) time_pipeline = [] if add_offset: - time_pipeline.append(('time_offset', TimeOffset(operation='upd', selected_type='TIME'))) + time_pipeline.append( + ('time_offset', TimeOffset(operation='upd', selected_type='TIME'))) if add_diff: - time_pipeline.append(('time_diff', TimeDiff(operation='add', selected_type='TIME'))) + time_pipeline.append( + ('time_diff', TimeDiff(operation='add', selected_type='TIME'))) return time_pipeline @staticmethod @@ -171,7 +200,9 @@ def get_imputation_pipeline(params): List of primitives to be applied (based on the given setting) """ strategy = params.get('imputation_strategy', 'most_frequent') - impute_pipeline = [('imputer', Imputation(operation='upd', selected_type='ALL', strategy=strategy))] + impute_pipeline = [('imputer', + Imputation(operation='upd', selected_type='ALL', + strategy=strategy))] return impute_pipeline @staticmethod @@ -186,22 +217,27 @@ def get_higher_order_pipeline(params): cat_cat_strategy = params.get('cat_cat_strategy', None) pipeline = [] if cat_num_strategy: - pipeline.append(('cat_num_encoder', CatNumEncoder(operation='add', selected_type1='CAT', - selected_type2='NUM', strategy=cat_num_strategy))) + pipeline.append(('cat_num_encoder', + CatNumEncoder(operation='add', selected_type1='CAT', + selected_type2='NUM', + strategy=cat_num_strategy))) if cat_cat_strategy: - pipeline.append(('cat_cat_encoder', CatCatEncoder(operation='add', selected_type1='CAT', - selected_type2='CAT', strategy=cat_cat_strategy))) + pipeline.append(('cat_cat_encoder', + CatCatEncoder(operation='add', selected_type1='CAT', + 
selected_type2='CAT', + strategy=cat_cat_strategy))) return pipeline class TabularData: """ Represents the data and its meta-info. - TabularData includes the training/testing data along with its meta info such as data types, cardinality etc. The - user can update the data and its meta info as well as select the features matching the criteria. + TabularData includes the training/testing data along with its meta info such + as data types, cardinality etc. The user can update the data and its meta + info as well as select the features matching the criteria. # Arguments verbose: Bool. Determines the verbosity of the logging. - data_info: Dict. Dictionary mapping the feature names to their data_types. + data_info: Dict. Dictionary mapping the feature names to their data_types total_samples: Int. Number of samples in the data cat_col: List. List of the categorical features num_col: List. List of the numerical features @@ -209,11 +245,14 @@ class TabularData: n_cat: Int. Number of categorical features n_num: Int. Number of numerical features n_time: Int. Number of time features - cat_cardinality: Dict. Dictionary mapping categorical feature names of their cardinality (no. of unique - values) - generated_features: List. List of the newly added features. (In addition to the pre-existing columns) - num_info: Dict. Dictionary mapping numeircal column to their meta info such as range, std etc. + cat_cardinality: Dict. Dictionary mapping categorical feature names of + their cardinality (no. of unique values) + generated_features: List. List of the newly added features. (In + addition to the pre-existing columns) + num_info: Dict. Dictionary mapping numeircal column to their meta info + such as range, std etc. """ + def __init__(self, raw_x, data_info, verbose=True): self.cat_col = None self.num_col = None @@ -230,11 +269,13 @@ def __init__(self, raw_x, data_info, verbose=True): self.refresh_col_types() # Convert sparse to dense if needed - raw_x = raw_x.toarray() if type(raw_x) == scipy.sparse.csr.csr_matrix else raw_x + raw_x = raw_x.toarray() if type( + raw_x) == scipy.sparse.csr.csr_matrix else raw_x # To pandas Dataframe if type(raw_x) != pd.DataFrame: - raw_x = pd.DataFrame(raw_x, columns=[str(i) for i in range(raw_x.shape[1])]) + raw_x = pd.DataFrame(raw_x, + columns=[str(i) for i in range(raw_x.shape[1])]) self.X = raw_x # self.update_cat_cardinality() @@ -267,14 +308,17 @@ def rename_cols(self, key): # Arguments key: Identifier for renaming # Returns - Renaming function which takes current column name and outputs a new unique column name. + Renaming function which takes current column name and outputs a new + unique column name. """ + def rename_fn(col_name): col_name = str(col_name) col_name += '_' + key while col_name in self.X.columns: col_name += '_' + key return col_name + return rename_fn def update(self, operation, columns, x_tr, new_type=None, key=''): @@ -287,7 +331,8 @@ def update(self, operation, columns, x_tr, new_type=None, key=''): key: Name key for renaming the new columns # Returns None - This function takes the transformed (or generated) features after applying the primitive and updates the + This function takes the transformed (or generated) features after applying + the primitive and updates the TabularData. """ if operation == 'upd': @@ -314,7 +359,8 @@ def refresh_col_types(self): None # Returns None - This function updates the cat, num and time column lists based on (any) updates in the data_info. 
+ This function updates the cat, num and time column lists based on (any) + updates in the data_info. """ self.cat_col = [k for k, v in self.data_info.items() if v == 'CAT'] self.num_col = [k for k, v in self.data_info.items() if v == 'NUM'] @@ -339,7 +385,8 @@ def update_cat_cardinality(self): def select_columns(self, data_type): """ Returns all the columns matching the input data_type # Arguments - data_type: Required type of the data (either of 'CAT', 'NUM', 'TIME' or 'ALL') + data_type: Required type of the data (either of 'CAT', 'NUM', 'TIME' or + 'ALL') # Returns List of the feature columns matching the input criteria. """ @@ -360,18 +407,26 @@ def select_columns(self, data_type): class Primitive(BaseEstimator, TransformerMixin): """ Base class for the single order data transformation function. - Primitive learns and applies the data transformation on a given set of features. The user can use fit() and - transform() functions to apply these transformations. + Primitive learns and applies the data transformation on a given set of + features. The user can use fit() and transform() functions to apply these + transformations. # Arguments options: Dict. Special arguments specific to the given primitive. - selected_type: 'String'. Specifies the type of features the transformation is supposed to be applied to. - operation: 'String'. Specifies the type of operation from 'add', 'update' or 'delete' - name_key : 'String'. Signature key to rename the column after applying the primitive. - selected: 'List'. List of the selected features, on which the transformation will be applied - drop_columns: 'List'. List of the features which would be dropped after applying the transformation. - supported_ops: Tuple. Specifies the allowed list of operations for this primitive. + selected_type: 'String'. Specifies the type of features the + transformation is supposed to be applied to. + operation: 'String'. Specifies the type of operation from 'add', 'update' + or 'delete' + name_key : 'String'. Signature key to rename the column after applying + the primitive. + selected: 'List'. List of the selected features, on which the + transformation will be applied + drop_columns: 'List'. List of the features which would be dropped after + applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this + primitive. """ + def __init__(self, operation='upd', selected_type=None, **kwargs): self.options = None self.selected = None @@ -383,7 +438,8 @@ def __init__(self, operation='upd', selected_type=None, **kwargs): self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): - """ Initialize the primitive specific variables (which are not defined in the base class) + """ Initialize the primitive specific variables (which are not defined in the + base class) # Arguments kwargs: Dictionary containing primitive specific variables # Returns @@ -392,7 +448,8 @@ def init_vars(self, **kwargs): self.options = kwargs def fit(self, data, y=None): - """ A wrapper function to train the given primitive on the input training data. + """ A wrapper function to train the given primitive on the input training + data. # Arguments data: A TabularData instance of training data. y: A numpy array of the target values. 
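# To make the fit()/transform() wrapper pattern described above concrete, a
# hypothetical Primitive subclass (illustrative only, not added by this patch) that
# clips the selected numerical columns to learned quantile bounds would look like:
class QuantileClip(Primitive):
    bounds = None
    supported_ops = ('upd',)

    def _fit(self, data, y=None):
        # Learn per-column clip bounds on the selected NUM columns.
        self.bounds = {col: (data.X[col].quantile(0.01), data.X[col].quantile(0.99))
                       for col in self.selected}
        return self

    def _transform(self, data, y=None):
        x_tr = pd.DataFrame()
        for col in self.selected:
            lower, upper = self.bounds[col]
            x_tr[col] = data.X[col].clip(lower, upper)
        data.update(self.operation, self.selected, x_tr, new_type='NUM',
                    key=self.name_key)
        return data

# It would be registered like the existing primitives, e.g. as a pipeline step
# ('quantile_clip', QuantileClip(operation='upd', selected_type='NUM')).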
@@ -401,14 +458,17 @@ def fit(self, data, y=None): """ self.selected = data.select_columns(self.selected_type) if self.operation not in self.supported_ops: - print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) + print("Operation {} not supported for {}".format(self.operation, + self.__class__.__name__) + ) self.selected = None if not self.selected: return self return self._fit(data, y) def transform(self, data, y=None): - """ A wrapper function to generate transformation on the input data based on pre-trained primitive. + """ A wrapper function to generate transformation on the input data based on + pre-trained primitive. # Arguments data: Input training/testing data in TabularData form. y: A numpy array of the target values. @@ -421,7 +481,8 @@ def transform(self, data, y=None): @abstractmethod def _fit(self, data, y=None): - """ Contains the actual implementation of training the primitive (implemented in the child class) + """ Contains the actual implementation of training the primitive (implemented + in the child class) # Arguments data: A TabularData instance of training data. y: A numpy array of the target values. @@ -432,8 +493,8 @@ def _fit(self, data, y=None): @abstractmethod def _transform(self, data, y=None): - """ Contains the actual implementation of transforming the data using primitive. (implemented in the child - class) + """ Contains the actual implementation of transforming the data using + primitive. (implemented in the child class) # Arguments data: Input training/testing data in TabularData form. y: A numpy array of the target values. @@ -446,25 +507,34 @@ def _transform(self, data, y=None): class PrimitiveHigherOrder: """ Base class for the cross-order data transformation function. - PrimitiveHigherOrder learns and applies the data transformation across two sets of features. The user can use - fit() and transform() functions to apply these transformations. + PrimitiveHigherOrder learns and applies the data transformation across two + sets of features. The user can use fit() and transform() functions to + apply these transformations. # Arguments options: Dict. Special arguments specific to the given primitive. - selected_type1: 'String'. Specifies the first type of features the transformation is supposed to be applied - to. - selected_type2: 'String'. Specifies the second type of features the transformation is supposed to be applied - to. - operation: 'String'. Specifies the type of operation from 'add', 'update' or 'delete' - name_key : 'String'. Signature key to rename the column after applying the primitive. - selected_1: 'List'. List of the selected features in the first set, on which the transformation will be + selected_type1: 'String'. Specifies the first type of features the + transformation is supposed to be applied to. + selected_type2: 'String'. Specifies the second type of features the + transformation is supposed to be applied to. + operation: 'String'. Specifies the type of operation from 'add', 'update' + or 'delete' + name_key : 'String'. Signature key to rename the column after applying + the primitive. + selected_1: 'List'. List of the selected features in the first set, on + which the transformation will be applied - selected_2: 'List'. List of the selected features in the second set, on which the transformation will be + selected_2: 'List'. List of the selected features in the second set, on + which the transformation will be applied - drop_columns: 'List'. 
List of the features which would be dropped after applying the transformation. - supported_ops: Tuple. Specifies the allowed list of operations for this primitive. + drop_columns: 'List'. List of the features which would be dropped after + applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this + primitive. """ - def __init__(self, operation='upd', selected_type1=None, selected_type2=None, **kwargs): + + def __init__(self, operation='upd', selected_type1=None, selected_type2=None, + **kwargs): self.options = None self.selected_1 = None self.selected_2 = None @@ -477,7 +547,8 @@ def __init__(self, operation='upd', selected_type1=None, selected_type2=None, ** self.name_key = self.__class__.__name__ def init_vars(self, **kwargs): - """ Initialize the primitive specific variables (which are not defined in the base class) + """ Initialize the primitive specific variables (which are not defined in the + base class) # Arguments kwargs: Dictionary containing primitive specific variables # Returns @@ -486,7 +557,8 @@ def init_vars(self, **kwargs): self.options = kwargs def fit(self, data, y=None): - """ A wrapper function to train the given primitive on the input training data. + """ A wrapper function to train the given primitive on the input training + data. # Arguments data: A TabularData instance of training data. y: A numpy array of the target values. @@ -497,7 +569,9 @@ def fit(self, data, y=None): self.selected_2 = data.select_columns(self.selected_type2) if self.operation not in self.supported_ops: - print("Operation {} not supported for {}".format(self.operation, self.__class__.__name__)) + print("Operation {} not supported for {}".format(self.operation, + self.__class__.__name__) + ) self.selected_1 = None self.selected_2 = None if not self.selected_1 or not self.selected_2: @@ -505,7 +579,8 @@ def fit(self, data, y=None): return self._fit(data, y) def transform(self, data, y=None): - """ A wrapper function to generate transformation on the input data based on pre-trained primitive. + """ A wrapper function to generate transformation on the input data based on + pre-trained primitive. # Arguments data: Input training/testing data in TabularData form. y: A numpy array of the target values. @@ -518,7 +593,8 @@ def transform(self, data, y=None): @abstractmethod def _fit(self, data, y=None): - """ Contains the actual implementation of training the primitive (implemented in the child class) + """ Contains the actual implementation of training the primitive (implemented + in the child class) # Arguments data: A TabularData instance of training data. y: A numpy array of the target values. @@ -529,8 +605,8 @@ def _fit(self, data, y=None): @abstractmethod def _transform(self, data, y=None): - """ Contains the actual implementation of transforming the data using primitive. (implemented in the child - class) + """ Contains the actual implementation of transforming the data using + primitive. (implemented in the child class) # Arguments data: Input training/testing data in TabularData form. y: A numpy array of the target values. @@ -543,7 +619,8 @@ def _transform(self, data, y=None): class TabScaler(Primitive): """ Standard Scaler primitive. - TabScaler scales the selected numerical features to have 0 mean and unit variance. + TabScaler scales the selected numerical features to have 0 mean and unit + variance. # Arguments scaler: StandardScaler. 
Instance of scikit-learn StandardScaler object @@ -558,17 +635,20 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.scaler.transform(data.X[self.selected]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class BoxCox(Primitive): """ Power Transform primitive. - The class applies BoxCox power transformation to make the selected features have normal distribution. + The class applies BoxCox power transformation to make the selected features + have normal distribution. # Arguments - transformer: PowerTransformer. Instance of scikit-learn PowerTransformer object + transformer: PowerTransformer. Instance of scikit-learn PowerTransformer + object """ transformer = None supported_ops = ('add', 'upd') @@ -580,20 +660,25 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = self.transformer.transform(data.X[self.selected]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class Binning(Primitive): """ Numerical binning primitive. - The class applies divides the given numeric column in the list of buckets, based on the range of their values. + The class applies divides the given numeric column in the list of buckets, + based on the range of their values. # Arguments - binner: KBinsDiscretizer. Instance of scikit-learn KBinsDiscretizer object - strategy: String. Strategy used to define width of the bins. Possible options are: (‘uniform’, ‘quantile’, + binner: KBinsDiscretizer. Instance of scikit-learn KBinsDiscretizer + object + strategy: String. Strategy used to define width of the bins. Possible + options are: (‘uniform’, ‘quantile’, ‘kmeans’) - encoding: String. Method used to encode the transformed result. Possible options are: (‘onehot’, + encoding: String. Method used to encode the transformed result. Possible + options are: (‘onehot’, ‘onehot-dense’, ‘ordinal’) """ binner = None @@ -612,14 +697,16 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame(self.binner.transform(data.X[self.selected])) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class OneHot(Primitive): """ One Hot Encoder for categorical features. - The class applies one hot encoding to categorical features, using the sklearn implementation. + The class applies one hot encoding to categorical features, using the + sklearn implementation. # Arguments ohe: OneHotEncoder. 
Instance of scikit-learn OneHotEncoder object @@ -635,9 +722,11 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame(self.ohe.transform(data.X[self.selected])) if self.operation == 'add': - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) elif self.operation == 'upd': - data.update('add', self.selected, x_tr, new_type='NUM', key=self.name_key) + data.update('add', self.selected, x_tr, new_type='NUM', + key=self.name_key) data.update('del', self.selected, None, None, key=self.name_key) return data @@ -645,12 +734,14 @@ def _transform(self, data, y=None): class LabelEncode(Primitive): """ Label Encoder for categorical features. - The class applies Label Encoding to categorical features, By mapping each category to a numerical value. + The class applies Label Encoding to categorical features, By mapping each + category to a numerical value. # Arguments - cat_to_int_label: Dict. Mapping from categories to their assigned integer value - unknown_key_dict: Dict. Mapping for each categorical feature column to the integer value to replace the - previously unseen categories + cat_to_int_label: Dict. Mapping from categories to their assigned integer + value + unknown_key_dict: Dict. Mapping for each categorical feature column to + the integer value to replace the previously unseen categories """ cat_to_int_label = None unknown_key_dict = None @@ -660,26 +751,33 @@ def _fit(self, data, y=None): self.cat_to_int_label = {} self.unknown_key_dict = {} for col in self.selected: - self.cat_to_int_label[col] = {key: idx for idx, key in enumerate(set(data.X[col]))} + self.cat_to_int_label[col] = {key: idx for idx, key in + enumerate(set(data.X[col]))} self.unknown_key_dict[col] = len(self.cat_to_int_label[col]) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].apply(lambda key: self.cat_to_int_label[col].get(key, self.unknown_key_dict[col])) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + x_tr[col] = data.X[col].apply( + lambda key: self.cat_to_int_label[col].get(key, + self.unknown_key_dict[ + col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class TargetEncoder(Primitive): """ Target Encoder for categorical features. - The class applies target encoding to categorical features, By learning the mapping of category to numeric value + The class applies target encoding to categorical features, By learning + the mapping of category to numeric value based on some aggregation of the target value. # Arguments - target_encoding_map: Dict. Mapping from categories to their assigned numeric value + target_encoding_map: Dict. 
Mapping from categories to their assigned + numeric value """ target_encoding_map = None supported_ops = ('add', 'upd') @@ -712,28 +810,34 @@ def _fit(self, data, y=None): self.target_encoding_map = {} X['target'] = y for col in self.selected: - self.target_encoding_map[col] = self.calc_smooth_mean(X, col, 'target', alpha=5) + self.target_encoding_map[col] = self.calc_smooth_mean(X, col, 'target', + alpha=5) X.drop('target', axis=1, inplace=True) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], self.target_encoding_map[col][1]) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], + self.target_encoding_map[col][1]) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class CatCatEncoder(PrimitiveHigherOrder): """ Cross column feature generator between categorical and categorical columns. - The class learns a new features based on the values of selected two categorical features. + The class learns a new features based on the values of selected two + categorical features. # Arguments - cat_cat_map: Dict. Mapping from cat-cat combination to the an assigned numeric value - strategy: String. Aggregation strategy to learn the mapping between cat-cat combination to numeric value + cat_cat_map: Dict. Mapping from cat-cat combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-cat combination to numeric value """ - supported_ops = ('add', ) + supported_ops = ('add',) cat_cat_map = None strategy = None @@ -742,7 +846,8 @@ def init_vars(self, strategy='count'): @staticmethod def cat_cat_count(df, col1, col2, strategy='count'): - """ Generate mapping for cat-cat combination to the numerical value based on the given strategy. + """ Generate mapping for cat-cat combination to the numerical value based on + the given strategy. # Arguments col1: First categorical column col2: Second categorical column @@ -755,35 +860,45 @@ def cat_cat_count(df, col1, col2, strategy='count'): elif strategy == 'nunique': mapping = df.groupby([col1])[col2].nunique() else: - mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[col2].nunique() + mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[ + col2].nunique() return mapping def _fit(self, data, y=None): self.cat_cat_map = {} self.selected_1 = list(set(self.selected_1 + self.selected_2)) for col1, col2 in itertools.combinations(self.selected_1, 2): - self.cat_cat_map[col1 + '_cross_' + col2] = self.cat_cat_count(data.X, col1, col2, self.strategy) + self.cat_cat_map[col1 + '_cross_' + col2] = \ + self.cat_cat_count(data.X, + col1, + col2, + self.strategy) return self def _transform(self, data, y=None): x_tr = pd.DataFrame() for col1, col2 in itertools.combinations(self.selected_1, 2): if col1 + '_cross_' + col2 in self.cat_cat_map: - x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_cat_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_cat_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) return data class CatNumEncoder(PrimitiveHigherOrder): """ Cross column feature generator between categorical and numerical columns. 
- The class learns a new features based on the values of selected categorical and numerical features. + The class learns a new features based on the values of selected categorical + and numerical features. # Arguments - cat_num_map: Dict. Mapping from cat-num combination to the an assigned numeric value - strategy: String. Aggregation strategy to learn the mapping between cat-num combination to numeric value + cat_num_map: Dict. Mapping from cat-num combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-num combination to numeric value """ - supported_ops = ('add', ) + supported_ops = ('add',) cat_num_map = None strategy = None @@ -792,7 +907,8 @@ def init_vars(self, strategy='mean'): @staticmethod def cat_num_interaction(df, col1, col2, method='mean'): - """ Generate mapping for cat-num combination to the numerical value based on the given strategy. + """ Generate mapping for cat-num combination to the numerical value based on + the given strategy. # Arguments col1: categorical column col2: numerical column @@ -817,7 +933,8 @@ def _fit(self, data, y=None): self.cat_num_map = {} for col1 in self.selected_1: for col2 in self.selected_2: - self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(data.X, col1, col2, self.strategy) + self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction( + data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): @@ -825,21 +942,26 @@ def _transform(self, data, y=None): for col1 in self.selected_1: for col2 in self.selected_2: if col1 + '_cross_' + col2 in self.cat_num_map: - x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_num_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_num_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) return data class CatBinEncoder(PrimitiveHigherOrder): """ Cross column feature generator between categorical and binary columns. - The class learns a new features based on the values of selected categorical and binary features. + The class learns a new features based on the values of selected categorical + and binary features. # Arguments - cat_bin_map: Dict. Mapping from cat-bin combination to the an assigned numeric value - strategy: String. Aggregation strategy to learn the mapping between cat-bin combination to numeric value + cat_bin_map: Dict. Mapping from cat-bin combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-bin combination to numeric value """ - supported_ops = ('add', ) + supported_ops = ('add',) cat_bin_map = None strategy = None @@ -848,7 +970,8 @@ def init_vars(self, strategy='percent_true'): @staticmethod def cat_bin_interaction(df, col1, col2, strategy='percent_true'): - """ Generate mapping for cat-bin combination to the numerical value based on the given strategy. + """ Generate mapping for cat-bin combination to the numerical value based on + the given strategy. 
# Arguments col1: Categorical column col2: Binary column @@ -868,7 +991,8 @@ def _fit(self, data, y=None): self.cat_bin_map = {} for col1 in self.selected_1: for col2 in self.selected_2: - self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(data.X, col1, col2, self.strategy) + self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction( + data.X, col1, col2, self.strategy) return self def _transform(self, data, y=None): @@ -876,16 +1000,18 @@ def _transform(self, data, y=None): for col1 in self.selected_1: for col2 in self.selected_2: if col1 + '_cross_' + col2 in self.cat_bin_map: - x_tr[col1 + '_cross_' + col2] = data.X[col1].map(self.cat_bin_map[col1 + '_cross_' + col2]) - data.update(self.operation, self.selected_1, x_tr, new_type='NUM', key=self.name_key) + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_bin_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) return data class FilterConstant(Primitive): """ Filters the constant or very low variance columns. - The class finds the non-changing or very low variance columns and marked them for deletion, so that they are - not used by the machine learning estimator. + The class finds the non-changing or very low variance columns and marked them + for deletion, so that they are not used by the machine learning estimator. """ drop_columns = None supported_ops = ('del',) @@ -896,16 +1022,18 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) return data class TimeDiff(Primitive): """ Adds features based on difference of time values. - This class generates the features as time difference between two selected time columns. + This class generates the features as time difference between two selected + time columns. """ - supported_ops = ('add', ) + supported_ops = ('add',) def _fit(self, data, y=None): return self @@ -914,14 +1042,16 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for a, b in itertools.combinations(self.selected, 2): x_tr[a + '-' + b] = data.X[a] - data.X[b] - data.update(self.operation, self.selected, x_tr, new_type='TIME', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='TIME', + key=self.name_key) return data class TimeOffset(Primitive): """ Updates the time features in terms of difference from the start value. - This class updates the time features such that they are represented as a difference from the start time. + This class updates the time features such that they are represented as a + difference from the start time. # Arguments start_time: Int. Starting time of the selected time feature. @@ -936,19 +1066,21 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() x_tr[self.selected] = data.X[self.selected] - self.start_time - data.update(self.operation, self.selected, x_tr, new_type='TIME', key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type='TIME', + key=self.name_key) return data class TabPCA(Primitive): """ Generates new features by finding PCA of the selected features. - The class calculates the PCA of the selected features and adds the transformation as new set of features. + The class calculates the PCA of the selected features and adds the + transformation as new set of features. 
# Arguments pca: PCA. Scikit-lean PCA class. """ pca = None - supported_ops = ('add', ) + supported_ops = ('add',) def _fit(self, data, y=None): self.pca = PCA(n_components=0.99, svd_solver='full') @@ -957,8 +1089,10 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_pca = self.pca.transform(data.X[self.selected]) - x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in range(x_pca.shape[1])]) - data.update(self.operation, self.selected, x_pca, new_type='NUM', key=self.name_key) + x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in + range(x_pca.shape[1])]) + data.update(self.operation, self.selected, x_pca, new_type='NUM', + key=self.name_key) return data @@ -967,7 +1101,8 @@ class CatCount(Primitive): Replaces the cargorical variables by their occrance count. # Arguments - count_dict: Dict. Mapping of the categories to their respective frequency count. + count_dict: Dict. Mapping of the categories to their respective frequency + count. unknown_key: Float. Mapping value for previously unseen category. """ count_dict = None @@ -983,16 +1118,18 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[col] = data.X[col].apply(lambda key: self.count_dict[col].get(key, self.unknown_key)) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + x_tr[col] = data.X[col].apply( + lambda key: self.count_dict[col].get(key, self.unknown_key)) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class LogTransform(Primitive): """ Calculates the log transformation. - The class Calculates the log transform value of the given numeric feature. The formula is: - sign(x) * log(1 + mod(x)) + The class Calculates the log transform value of the given numeric feature. + The formula is: sign(x) * log(1 + mod(x)) """ name_key = 'log_' supported_ops = ('add', 'upd') @@ -1003,15 +1140,18 @@ def _fit(self, data, y=None): def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: - x_tr[self.name_key + col] = np.sign(data.X[col]) * np.log(1 + np.abs(data.X[col])) - data.update(self.operation, self.selected, x_tr, new_type='NUM', key=self.name_key) + x_tr[self.name_key + col] = np.sign(data.X[col]) * np.log( + 1 + np.abs(data.X[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) return data class Imputation(Primitive): """ Filters the features based on Pearson Correlation. - The class removes the features who have low pearson correlation with the target. + The class removes the features who have low pearson correlation with the + target. # Arguments threshold: Float. Threshold for filtering features. 
""" @@ -1027,7 +1167,8 @@ def _fit(self, data, y=None): for col in self.selected: if self.strategy == 'most_frequent': value_counts = data.X[col].value_counts() - self.impute_dict[col] = value_counts.idxmax() if not value_counts.empty else 0 + self.impute_dict[ + col] = value_counts.idxmax() if not value_counts.empty else 0 elif self.strategy == 'zero': self.impute_dict[col] = 0 else: @@ -1038,14 +1179,16 @@ def _transform(self, data, y=None): x_tr = pd.DataFrame() for col in self.selected: x_tr[col] = data.X[col].fillna(self.impute_dict[col]) - data.update(self.operation, self.selected, x_tr, new_type=None, key=self.name_key) + data.update(self.operation, self.selected, x_tr, new_type=None, + key=self.name_key) return data class FeatureFilter(Primitive): """ Filters the features based on Pearson Correlation. - The class removes the features who have low pearson correlation with the target. + The class removes the features who have low pearson correlation with the + target. # Arguments threshold: Float. Threshold for filtering features. """ @@ -1068,18 +1211,21 @@ def _fit(self, data, y=None): return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) return data class FeatureImportance(Primitive): """ Filters the features based on feature importance score. - The class learns a Light GBM estimator for the given data and based on the feature importance scores, filters - the features with importance lower than the threshold. + The class learns a Light GBM estimator for the given data and based on the + feature importance scores, filters the features with importance lower than + the threshold. # Arguments threshold: Float. Threshold for filtering features. - task_type: 'String'. Specifies the task type amongst: ('classification', 'regression') + task_type: 'String'. 
Specifies the task type amongst: ('classification', + 'regression') """ threshold = None task_type = 'classification' @@ -1114,12 +1260,14 @@ def _fit(self, data, y=None): objective='regression') estimator.fit(data.X, y) feature_importance = estimator.feature_importances_ - feature_importance = feature_importance/feature_importance.mean() - self.drop_columns = data.X.columns[np.where(feature_importance < self.threshold)[0]] + feature_importance = feature_importance / feature_importance.mean() + self.drop_columns = data.X.columns[ + np.where(feature_importance < self.threshold)[0]] return self def _transform(self, data, y=None): - data.update(self.operation, self.drop_columns, None, new_type=None, key=self.name_key) + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) return data @@ -1141,8 +1289,8 @@ def _transform(self, data, y=None): datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) print(x_train[:4, 20]) prep = Preprocessor() - prep.fit(x_train, y_train, 24*60*60, datainfo) + prep.fit(x_train, y_train, 24 * 60 * 60, datainfo) x_new = prep.transform(x_train) print("-----") - print(x_new[:4, 2]) \ No newline at end of file + print(x_new[:4, 2]) diff --git a/autokaggle/utils.py b/autokaggle/utils.py index e93f7d8..bb3aa69 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -8,7 +8,8 @@ def generate_rand_string(size): chars = string.ascii_uppercase + string.digits return ''.join(random.choice(chars) for _ in range(size)) - + + def ensure_dir(directory): """Create directory if it does not exist.""" if not os.path.exists(directory): @@ -22,20 +23,26 @@ def temp_path_generator(): def rand_temp_folder_generator(): - """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras).""" + """ + Create and return a temporary directory with the path name + '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras). + """ sys_temp = temp_path_generator() path = sys_temp + '_' + generate_rand_string(6) ensure_dir(path) return path + def write_json(data, filename): with open(filename, 'w') as outfile: json.dump(data, outfile) - + + def read_json(filename): with open(filename, 'rb') as infile: return json.load(infile) + def write_csv(filename, line): with open(filename, "a") as f: f.write(", ".join(map(str, line))) diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 0b5b22a..57cd47e 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -1,18 +1,21 @@ import string import random import sys + sys.path.append("../") import numpy as np import pandas as pd import sklearn.model_selection import sklearn.datasets -from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, balanced_accuracy_score,\ -mean_absolute_error, mean_squared_error +from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, \ + balanced_accuracy_score, \ + mean_absolute_error, mean_squared_error # from autosklearn.regression import AutoSklearnRegressor # from autosklearn.classification import AutoSklearnClassifier from autokaggle import * from autokaggle.utils import * import openml + openml.config.apikey = '3c7196c92a274c3b9405a7e26e9f848e' import warnings from abc import abstractmethod @@ -27,22 +30,25 @@ def generate_rand_string(size): class BenchmarkingBase: """ Base class for benchmarking autoML platforms. - This class benchmarks the performance of the given autoML platform. 
The user can call evaluate() method to - evaluate the performance on a single task or run_automation() for the list of the tasks. The tasks are OpenML + This class benchmarks the performance of the given autoML platform. The + user can call evaluate() method to evaluate the performance on a single + task or run_automation() for the list of the tasks. The tasks are OpenML tasks, which specify the dataset and the train/test/validation folds etc. # Arguments results: List. List of the results for each evaluation - sess_name: String. Name of the evaluation session, used for storing the results. + sess_name: String. Name of the evaluation session, used for storing + the results. cls_desc: List. List of the columns to be added in classification result rgs_desc: List. List of the columns to be added in regression result cls_results: DataFrame. Table storing the classification results rgs_results: DataFrame. Table storing the regression results """ results = None - cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", "balanced_accuracy", "F1_score", "AUC"] + cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", + "balanced_accuracy", "F1_score", "AUC"] rgs_desc = ["automl_model", "task_id", "time_limit", "MSE", "MAE", "R2_score"] - + def __init__(self, supress_warnings=True, sess_name=""): if supress_warnings: warnings.filterwarnings('ignore') @@ -50,7 +56,7 @@ def __init__(self, supress_warnings=True, sess_name=""): self.sess_name = generate_rand_string(6) if not sess_name else sess_name self.cls_results = pd.DataFrame(columns=self.cls_desc) self.rgs_results = pd.DataFrame(columns=self.rgs_desc) - + def measure_performance_cls(self, y_true, y_pred, binary=False): """ Calculate the performance of the classification task # Arguments @@ -62,7 +68,8 @@ def measure_performance_cls(self, y_true, y_pred, binary=False): """ accuracy = accuracy_score(y_true, y_pred) ber = balanced_accuracy_score(y_true, y_pred) - f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score(y_true, y_pred, average="weighted") + f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score( + y_true, y_pred, average="weighted") auc = roc_auc_score(y_true, y_pred) if binary else "-" return [accuracy, ber, f1, auc] @@ -78,7 +85,7 @@ def measure_performance_rgs(self, y_true, y_pred): mae = mean_absolute_error(y_true, y_pred) r2 = r2_score(y_true, y_pred) return [mse, mae, r2] - + def export_results(self): """ Writes the results to a CSV file. # Arguments @@ -87,10 +94,12 @@ def export_results(self): None """ if len(self.cls_results) > 0: - self.cls_results.to_csv(self.sess_name + "_classification_results.csv", index=False) + self.cls_results.to_csv(self.sess_name + "_classification_results.csv", + index=False) if len(self.rgs_results) > 0: - self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", index=False) - + self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", + index=False) + @abstractmethod def evaluate(self, task, time_limit): """ Evaluates the performance of the single task. @@ -101,8 +110,8 @@ def evaluate(self, task, time_limit): List of performance scores of the autoML system on the given task. 
""" pass - - def run_automation(self, task_list, time_limit=10*60): + + def run_automation(self, task_list, time_limit=10 * 60): """ Evaluate the list of the tasks in sequence # Arguments task_list: List of OpenML task ids @@ -116,22 +125,24 @@ def run_automation(self, task_list, time_limit=10*60): self.export_results() except: print("task: {} didnt work".format(task)) - - def time_lapse(self, task_id, time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): + + def time_lapse(self, task_id, + time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): """ Evaluate the task on different time_limits # Arguments task_id: Id of the OpenML task flow time_limits: List of the time_limits to test the performance on # Returns List of combined results of the autoML on each of the time_limit - This function evaluates and compares the performance of the autoML system on different time_limits. It is - helpful to understand the amount of improvement with increase in time budget + This function evaluates and compares the performance of the autoML system + on different time_limits. It is helpful to understand the amount of + improvement with increase in time budget """ tl_results = [] for time_limit in time_limits: tl_results.append(self.evaluate(task_id, time_limit=time_limit)) return tl_results - + def get_dataset_splits(self, task_id): """ Get the train/test splits for the given task # Arguments @@ -142,18 +153,21 @@ def get_dataset_splits(self, task_id): task = openml.tasks.get_task(task_id) train_indices, test_indices = task.get_train_test_split_indices() dataset = task.get_dataset() - X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + X, y, categorical_indicator, attribute_names = dataset.get_data( + target=task.target_name, dataset_format='array') x_train, y_train = X[train_indices], y[train_indices] x_test, y_test = X[test_indices], y[test_indices] return x_train, y_train, x_test, y_test - - + + class BenchmarkingAutoKaggle(BenchmarkingBase): """ Extends the benchmarking class for evaluating AutoKaggle. - This class evaluates the performance of AutoKaggle on the input classification or regression task_list. + This class evaluates the performance of AutoKaggle on the input + classification or regression task_list. """ + def get_data_info(self, dataset, num_cols): """ Get the info of each feature data type # Arguments @@ -176,8 +190,8 @@ def get_data_info(self, dataset, num_cols): else: data_info.append("CAT") return np.array(data_info) - - def evaluate(self, task_id, time_limit=10*60): + + def evaluate(self, task_id, time_limit=10 * 60): """ See base class. 
""" @@ -185,7 +199,8 @@ def evaluate(self, task_id, time_limit=10*60): task = openml.tasks.get_task(task_id) train_indices, test_indices = task.get_train_test_split_indices() dataset = task.get_dataset() - X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') + X, y, categorical_indicator, attribute_names = dataset.get_data( + target=task.target_name, dataset_format='array') x_train, y_train = X[train_indices], y[train_indices] x_test, y_test = X[test_indices], y[test_indices] @@ -200,16 +215,17 @@ def evaluate(self, task_id, time_limit=10*60): automl = Regressor() else: print("UNSUPPORTED TASK_TYPE") - assert(0) + assert (0) automl.fit(x_train, y_train, time_limit=time_limit, data_info=data_info) # Evaluate y_hat = automl.predict(x_test) - + if task.task_type == 'Supervised Classification': is_binary = True if len(task.class_labels) <= 2 else False - result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) + result = task_info + self.measure_performance_cls(y_test, y_hat, + binary=is_binary) self.cls_results.loc[len(self.cls_results)] = result elif task.task_type == 'Supervised Regression': result = task_info + self.measure_performance_rgs(y_test, y_hat) @@ -217,21 +233,26 @@ def evaluate(self, task_id, time_limit=10*60): print(result) return result -# -# class BenchmarkingAutoSklearn(BenchmarkingBase): + # + # class BenchmarkingAutoSklearn(BenchmarkingBase): """ Extends the benchmarking class for evaluating AutoSklearn. - This class evaluates the performance of AutoKaggle on the input classification or regression task_list. + This class evaluates the performance of AutoKaggle on the input + classification or regression task_list. """ + + # def get_data_info(self, categorical_indicator): -# return ['Categorical' if ci else 'Numerical' for ci in categorical_indicator] +# return ['Categorical' if ci else 'Numerical' for ci in categorical +# indicator] # # def evaluate(self, task_id, time_limit=10*60): # task_info = ["autosklearn", task_id, time_limit] # task = openml.tasks.get_task(task_id) # train_indices, test_indices = task.get_train_test_split_indices() # dataset = task.get_dataset() -# X, y, categorical_indicator, attribute_names = dataset.get_data(target=task.target_name, dataset_format='array') +# X, y, categorical_indicator, attribute_names = dataset.get_data( +# target=task.target_name, dataset_format='array') # # x_train, y_train = X[train_indices], y[train_indices] # x_test, y_test = X[test_indices], y[test_indices] @@ -257,7 +278,8 @@ def evaluate(self, task_id, time_limit=10*60): # y_hat = automl.predict(x_test) # if task.task_type == 'Supervised Classification': # is_binary = True if len(task.class_labels) <= 2 else False -# result = task_info + self.measure_performance_cls(y_test, y_hat, binary=is_binary) +# result = task_info + self.measure_performance_cls(y_test, y_hat, +# binary=is_binary) # self.cls_results.loc[len(self.cls_results)] = result # elif task.task_type == 'Supervised Regression': # result = task_info + self.measure_performance_rgs(y_test, y_hat) @@ -286,7 +308,8 @@ def get_task_info(task_ids): task_ids: List of ids of OpenML task flows. # Returns dataset_list: List of the dataset Ids. 
@@ -286,7 +308,8 @@ def get_task_info(task_ids):
         task_ids: List of ids of OpenML task flows.
     # Returns
         dataset_list: List of the dataset Ids.
-        task_types: List of the task type (such as 'binary/multiclass classification' or 'regression'
+        task_types: List of the task type (such as 'binary/multiclass
+        classification' or 'regression'
     """
     task_types = []
     dataset_list = []
@@ -294,8 +317,11 @@ def get_task_info(task_ids):
         task = openml.tasks.get_task(t_id)
         dataset = openml.datasets.get_dataset(task.dataset_id)
         if task.task_type_id == 1:
-            _, y, _, _ = dataset.get_data(target=task.target_name, dataset_format='array')
-            task_type = "Binary Classification" if len(set(y)) <= 2 else "Multiclass classification ({})".format(len(set(y)))
+            _, y, _, _ = dataset.get_data(target=task.target_name,
+                                          dataset_format='array')
+            task_type = "Binary Classification" if len(
+                set(y)) <= 2 else "Multiclass classification ({})".format(
+                len(set(y)))
         else:
             task_type = "Regression"
         task_types.append(task_type)
@@ -309,16 +335,19 @@ def get_dataset_properties(task_ids):
         task_ids: List of ids of OpenML task flows
     # Returns
         Dataframe containing the info of each of the dataset.
-    This function provides the dataset info such as number of instances, number of numeric/nominal/string columns etc.
+    This function provides the dataset info such as number of instances, number of
+    numeric/nominal/string columns etc.
     """
    dataset_list, task_types = get_task_info(task_ids)
-    df = pd.DataFrame(columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", "#Date"])
+    df = pd.DataFrame(
+        columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String",
+                 "#Date"])
     for i, dataset in enumerate(dataset_list):
         df.loc[i] = [
-            dataset.name, 
+            dataset.name,
             dataset.qualities["NumberOfInstances"],
             task_types[i],
-            len(dataset.get_features_by_type('numeric')), 
+            len(dataset.get_features_by_type('numeric')),
             len(dataset.get_features_by_type('nominal')),
             len(dataset.get_features_by_type('string')),
             len(dataset.get_features_by_type('date')),
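A minimal usage sketch for the two helpers above, ahead of the get_performance_table hunk below. The task ids are a subset of the lists in the __main__ block at the end of this file, and the output columns are the ones defined in get_dataset_properties; assumes the openml package can reach the OpenML server:

    # Sketch only: summarise a few OpenML tasks before benchmarking them.
    dataset_list, task_types = get_task_info([3021, 31, 45])
    props = get_dataset_properties([3021, 31, 45])
    print(props.head())
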
""" test = pd.read_csv(filename) perf = pd.DataFrame(columns=["Name", "AutoKaggle", "AutoSklearn", "H2O.ai"]) task_ids = list(set(test["task_id"])) dataset_ids = get_dataset_ids(task_ids) - + test = test.set_index(["task_id", "automl_model"]) test.sort_index(inplace=True) for i, t_id in enumerate(task_ids): try: name = openml.datasets.get_dataset(dataset_ids[i]).name - auto_kaggle = test.loc[(t_id, "autokaggle")][metric].median() if (t_id, "autokaggle") in test.index else np.nan - auto_sklearn = test.loc[(t_id, "autosklearn")][metric].median() if (t_id, "autosklearn") in test.index else np.nan - h2o_ai = test.loc[(t_id, "autosklearn")][metric].median() if (t_id, "autosklearn") in test.index else np.nan + auto_kaggle = test.loc[(t_id, "autokaggle")][metric].median()\ + if (t_id, "autokaggle") in test.index else np.nan + auto_sklearn = test.loc[(t_id, "autosklearn")][metric].median()\ + if (t_id, "autosklearn") in test.index else np.nan + h2o_ai = test.loc[(t_id, "autosklearn")][metric].median()\ + if (t_id, "autosklearn") in test.index else np.nan perf.loc[i] = [name, auto_kaggle, auto_sklearn, h2o_ai] except Exception as e: print(e) @@ -362,12 +395,14 @@ def style_results(res): # Returns Highlighed data-frame """ + def highlight_max(s): """ Highlight the maximum in a Series yellow. """ is_max = s == s.max() return ['background-color: yellow' if v else '' for v in is_max] + res = res.set_index("Name") res.style.apply(highlight_max, axis=1) return res @@ -381,7 +416,8 @@ def get_box_plot(results, task_id, metric): metric: Score metric considered for the box-plot # Returns None - Builds and displays the box plot showing the variance in results for the AutoML performance on the given dataset. + Builds and displays the box plot showing the variance in results for the + AutoML performance on the given dataset. """ auto_sklearn = list(results.loc[(task_id, "autosklearn")][metric]) auto_kaggle = list(results.loc[(task_id, "autokaggle")][metric]) @@ -391,13 +427,16 @@ def get_box_plot(results, task_id, metric): auto_sklearn.append(med_sk) while len(auto_sklearn) > len(auto_kaggle): auto_kaggle.append(med_ak) - temp = pd.DataFrame(data={"Autokaggle":auto_kaggle, "AutoSklearn":auto_sklearn}) + temp = pd.DataFrame( + data={"Autokaggle": auto_kaggle, "AutoSklearn": auto_sklearn}) temp.boxplot() if __name__ == "__main__": - regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, 4850, 4839] - classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, 14951, 59, 24, 146230, 31, 10101, + regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, + 4850, 4839] + classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, + 14951, 59, 24, 146230, 31, 10101, 9914, 3020, 3524, 3573, 3962] ak = BenchmarkingAutoKaggle(sess_name='test_perf') import time @@ -410,7 +449,8 @@ def get_box_plot(results, task_id, metric): np.random.seed(1001) random.seed(1001) import time + t1 = time.time() ak.evaluate(3021) t2 = time.time() - print(t2-t1) + print(t2 - t1)