From 007d190d21c84f0997b8b2f1fa82484c32e596e3 Mon Sep 17 00:00:00 2001
From: kseniyausovich
Date: Fri, 5 Aug 2022 10:43:31 -0700
Subject: [PATCH 1/2] initial resample function

---
 src/pyuoi/linear_model/base.py | 27 +++++++++++++--------------
 src/pyuoi/utils.py             |  9 +++++++++
 tests/test_utils.py            | 22 +++++++++++++++++++++-
 3 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/src/pyuoi/linear_model/base.py b/src/pyuoi/linear_model/base.py
index 20dd316c..a67a1a8e 100644
--- a/src/pyuoi/linear_model/base.py
+++ b/src/pyuoi/linear_model/base.py
@@ -3,7 +3,6 @@
 import logging
 
 from sklearn.linear_model._base import SparseCoefMixin
 from sklearn.metrics import r2_score, accuracy_score, log_loss
-from sklearn.model_selection import train_test_split
 from sklearn.utils import check_X_y
 from sklearn.preprocessing import StandardScaler
@@ -13,7 +12,7 @@
 from pyuoi.mpi_utils import (Gatherv_rows, Bcast_from_root)
 
 from .utils import stability_selection_to_threshold, intersection
-from ..utils import check_logger
+from ..utils import check_logger, resample
 
 
 class AbstractUoILinearModel(SparseCoefMixin, metaclass=_abc.ABCMeta):
@@ -248,10 +247,10 @@ def fit(self, X, y, stratify=None, verbose=False):
         for boot in range(self.n_boots_sel):
             if size > 1:
                 if rank == 0:
-                    rvals = train_test_split(np.arange(X.shape[0]),
-                                             test_size=1 - self.selection_frac,
-                                             stratify=stratify,
-                                             random_state=self.random_state)
+                    rvals = resample(np.arange(X.shape[0]),
+                                     train_frac=self.selection_frac,
+                                     stratify=stratify,
+                                     random_state=self.random_state)
                 else:
                     rvals = [None] * 2
                 rvals = [Bcast_from_root(rval, self.comm, root=0)
@@ -259,9 +258,9 @@ def fit(self, X, y, stratify=None, verbose=False):
                 if boot in my_boots.keys():
                     my_boots[boot] = rvals
             else:
-                my_boots[boot] = train_test_split(
+                my_boots[boot] = resample(
                     np.arange(X.shape[0]),
-                    test_size=1 - self.selection_frac,
+                    train_frac=self.selection_frac,
                     stratify=stratify,
                     random_state=self.random_state)
 
@@ -339,10 +338,10 @@ def fit(self, X, y, stratify=None, verbose=False):
         for boot in range(self.n_boots_est):
             if size > 1:
                 if rank == 0:
-                    rvals = train_test_split(np.arange(X.shape[0]),
-                                             test_size=1 - self.estimation_frac,
-                                             stratify=stratify,
-                                             random_state=self.random_state)
+                    rvals = resample(np.arange(X.shape[0]),
+                                     train_frac=self.estimation_frac,
+                                     stratify=stratify,
+                                     random_state=self.random_state)
                 else:
                     rvals = [None] * 2
                 rvals = [Bcast_from_root(rval, self.comm, root=0)
@@ -350,9 +349,9 @@ def fit(self, X, y, stratify=None, verbose=False):
                 if boot in my_boots.keys():
                     my_boots[boot] = rvals
             else:
-                my_boots[boot] = train_test_split(
+                my_boots[boot] = resample(
                     np.arange(X.shape[0]),
-                    test_size=1 - self.estimation_frac,
+                    train_frac=self.estimation_frac,
                     stratify=stratify,
                     random_state=self.random_state)
 
diff --git a/src/pyuoi/utils.py b/src/pyuoi/utils.py
index 1b3709d0..b561bfc0 100755
--- a/src/pyuoi/utils.py
+++ b/src/pyuoi/utils.py
@@ -1,6 +1,7 @@
 import numpy as np
 import sys
 import logging
+from sklearn.model_selection import train_test_split
 
 
 def softmax(y, axis=-1):
@@ -152,3 +153,11 @@ def check_logger(logger, name='uoi', comm=None):
         handler.setFormatter(logging.Formatter(fmt))
         ret.addHandler(handler)
     return ret
+
+
+def resample(X, train_frac, stratify, random_state):
+    rvals = train_test_split(np.arange(X.shape[0]),
+                             test_size=1 - train_frac,
+                             stratify=stratify,
+                             random_state=random_state)
+    return rvals
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7675df7f..d1ca8b82 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,7 +7,7 @@
 from pyuoi.linear_model.utils import stability_selection_to_threshold
 from pyuoi.linear_model.utils import intersection
-from pyuoi.utils import check_logger
+from pyuoi.utils import check_logger, resample
 
 import logging
 
 try:
@@ -331,3 +331,23 @@ def test_check_logger_exists():
     logger = logging.getLogger()
     ret = check_logger(logger)
     assert ret is logger
+
+
+def test_class_stratify_check():
+    selection_frac = 0.9
+    idx = np.arange(100)
+    y = np.tile(np.arange(5), 20)
+    train, test = resample(idx, selection_frac=selection_frac, random_state=0, stratify=y)
+
+    if int(np.ceil(len(idx) * selection_frac)) != len(train):
+        raise ValueError("Incorrect train size")
+    if (len(idx) - int(np.ceil(len(idx) * selection_frac))) != len(test):
+        raise ValueError("Incorrect test size")
+
+    classes, dist = np.unique(y, return_counts=True)
+
+    for cl, di in zip(classes, dist):
+        if int(np.ceil(di * selection_frac)) != sum(y[train] == cl):
+            raise ValueError(f"Incorrect train class size {cl}")
+        if di - int(np.ceil(di * selection_frac)) != sum(y[test] == cl):
+            raise ValueError(f"Incorrect test class size {cl}")

From fdb147f9dce720b99dbb9859501af182a29affe6 Mon Sep 17 00:00:00 2001
From: kseniyausovich
Date: Fri, 5 Aug 2022 10:45:27 -0700
Subject: [PATCH 2/2] initial resample function

---
 tests/test_utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index d1ca8b82..231bd518 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -334,20 +334,20 @@ def test_check_logger_exists():
 
 
 def test_class_stratify_check():
-    selection_frac = 0.9
+    train_frac = 0.9
     idx = np.arange(100)
     y = np.tile(np.arange(5), 20)
-    train, test = resample(idx, selection_frac=selection_frac, random_state=0, stratify=y)
+    train, test = resample(idx, train_frac=train_frac, random_state=0, stratify=y)
 
-    if int(np.ceil(len(idx) * selection_frac)) != len(train):
+    if int(np.ceil(len(idx) * train_frac)) != len(train):
         raise ValueError("Incorrect train size")
-    if (len(idx) - int(np.ceil(len(idx) * selection_frac))) != len(test):
+    if (len(idx) - int(np.ceil(len(idx) * train_frac))) != len(test):
         raise ValueError("Incorrect test size")
 
     classes, dist = np.unique(y, return_counts=True)
 
     for cl, di in zip(classes, dist):
-        if int(np.ceil(di * selection_frac)) != sum(y[train] == cl):
+        if int(np.ceil(di * train_frac)) != sum(y[train] == cl):
             raise ValueError(f"Incorrect train class size {cl}")
-        if di - int(np.ceil(di * selection_frac)) != sum(y[test] == cl):
+        if di - int(np.ceil(di * train_frac)) != sum(y[test] == cl):
             raise ValueError(f"Incorrect test class size {cl}")