From 007d190d21c84f0997b8b2f1fa82484c32e596e3 Mon Sep 17 00:00:00 2001
From: kseniyausovich
Date: Fri, 5 Aug 2022 10:43:31 -0700
Subject: [PATCH 1/2] initial resample function

---
 src/pyuoi/linear_model/base.py | 27 +++++++++++++--------------
 src/pyuoi/utils.py             |  9 +++++++++
 tests/test_utils.py            | 22 +++++++++++++++++++++-
 3 files changed, 43 insertions(+), 15 deletions(-)

diff --git a/src/pyuoi/linear_model/base.py b/src/pyuoi/linear_model/base.py
index 20dd316c..a67a1a8e 100644
--- a/src/pyuoi/linear_model/base.py
+++ b/src/pyuoi/linear_model/base.py
@@ -3,7 +3,6 @@
 import logging
 
 from sklearn.linear_model._base import SparseCoefMixin
 from sklearn.metrics import r2_score, accuracy_score, log_loss
-from sklearn.model_selection import train_test_split
 from sklearn.utils import check_X_y
 from sklearn.preprocessing import StandardScaler
@@ -13,7 +12,7 @@
 from pyuoi.mpi_utils import (Gatherv_rows, Bcast_from_root)
 
 from .utils import stability_selection_to_threshold, intersection
-from ..utils import check_logger
+from ..utils import check_logger, resample
 
 
 class AbstractUoILinearModel(SparseCoefMixin, metaclass=_abc.ABCMeta):
@@ -248,10 +247,10 @@ def fit(self, X, y, stratify=None, verbose=False):
         for boot in range(self.n_boots_sel):
             if size > 1:
                 if rank == 0:
-                    rvals = train_test_split(np.arange(X.shape[0]),
-                                             test_size=1 - self.selection_frac,
-                                             stratify=stratify,
-                                             random_state=self.random_state)
+                    rvals = resample(np.arange(X.shape[0]),
+                                     train_frac=self.selection_frac,
+                                     stratify=stratify,
+                                     random_state=self.random_state)
                 else:
                     rvals = [None] * 2
                 rvals = [Bcast_from_root(rval, self.comm, root=0)
@@ -259,9 +258,9 @@ def fit(self, X, y, stratify=None, verbose=False):
                 if boot in my_boots.keys():
                     my_boots[boot] = rvals
             else:
-                my_boots[boot] = train_test_split(
+                my_boots[boot] = resample(
                     np.arange(X.shape[0]),
-                    test_size=1 - self.selection_frac,
+                    train_frac=self.selection_frac,
                     stratify=stratify,
                     random_state=self.random_state)
 
@@ -339,10 +338,10 @@ def fit(self, X, y, stratify=None, verbose=False):
         for boot in range(self.n_boots_est):
             if size > 1:
                 if rank == 0:
-                    rvals = train_test_split(np.arange(X.shape[0]),
-                                             test_size=1 - self.estimation_frac,
-                                             stratify=stratify,
-                                             random_state=self.random_state)
+                    rvals = resample(np.arange(X.shape[0]),
+                                     train_frac=self.estimation_frac,
+                                     stratify=stratify,
+                                     random_state=self.random_state)
                 else:
                     rvals = [None] * 2
                 rvals = [Bcast_from_root(rval, self.comm, root=0)
@@ -350,9 +349,9 @@ def fit(self, X, y, stratify=None, verbose=False):
                 if boot in my_boots.keys():
                     my_boots[boot] = rvals
             else:
-                my_boots[boot] = train_test_split(
+                my_boots[boot] = resample(
                     np.arange(X.shape[0]),
-                    test_size=1 - self.estimation_frac,
+                    train_frac=self.estimation_frac,
                     stratify=stratify,
                     random_state=self.random_state)
 
diff --git a/src/pyuoi/utils.py b/src/pyuoi/utils.py
index 1b3709d0..b561bfc0 100755
--- a/src/pyuoi/utils.py
+++ b/src/pyuoi/utils.py
@@ -1,6 +1,7 @@
 import numpy as np
 import sys
 import logging
+from sklearn.model_selection import train_test_split
 
 
 def softmax(y, axis=-1):
@@ -152,3 +153,11 @@ def check_logger(logger, name='uoi', comm=None):
         handler.setFormatter(logging.Formatter(fmt))
         ret.addHandler(handler)
     return ret
+
+
+def resample(X, train_frac, stratify, random_state):
+    rvals = train_test_split(np.arange(X.shape[0]),
+                             test_size=1 - train_frac,
+                             stratify=stratify,
+                             random_state=random_state)
+    return rvals
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 7675df7f..d1ca8b82 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,7 +7,7 @@
 from pyuoi.linear_model.utils import stability_selection_to_threshold
 from pyuoi.linear_model.utils import intersection
-from pyuoi.utils import check_logger
+from pyuoi.utils import check_logger, resample
 
 import logging
 
 try:
@@ -331,3 +331,23 @@ def test_check_logger_exists():
     logger = logging.getLogger()
     ret = check_logger(logger)
     assert ret is logger
+
+
+def test_class_stratify_check():
+    selection_frac = 0.9
+    idx = np.arange(100)
+    y = np.tile(np.arange(5), 20)
+    train, test = resample(idx, selection_frac=selection_frac, random_state=0, stratify=y)
+
+    if int(np.ceil(len(idx) * selection_frac)) != len(train):
+        raise ValueError("Incorrect train size")
+    if (len(idx) - int(np.ceil(len(idx) * selection_frac))) != len(test):
+        raise ValueError("Incorrect test size")
+
+    classes, dist = np.unique(y, return_counts=True)
+
+    for cl, di in zip(classes, dist):
+        if int(np.ceil(di * selection_frac)) != sum(y[train] == cl):
+            raise ValueError(f"Incorrect train class size {cl}")
+        if di - int(np.ceil(di * selection_frac)) != sum(y[test] == cl):
+            raise ValueError(f"Incorrect test class size {cl}")

From fdb147f9dce720b99dbb9859501af182a29affe6 Mon Sep 17 00:00:00 2001
From: kseniyausovich
Date: Fri, 5 Aug 2022 10:45:27 -0700
Subject: [PATCH 2/2] initial resample function

---
 tests/test_utils.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index d1ca8b82..231bd518 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -334,20 +334,20 @@ def test_check_logger_exists():
 
 
 def test_class_stratify_check():
-    selection_frac = 0.9
+    train_frac = 0.9
     idx = np.arange(100)
     y = np.tile(np.arange(5), 20)
-    train, test = resample(idx, selection_frac=selection_frac, random_state=0, stratify=y)
+    train, test = resample(idx, train_frac=train_frac, random_state=0, stratify=y)
 
-    if int(np.ceil(len(idx) * selection_frac)) != len(train):
+    if int(np.ceil(len(idx) * train_frac)) != len(train):
         raise ValueError("Incorrect train size")
-    if (len(idx) - int(np.ceil(len(idx) * selection_frac))) != len(test):
+    if (len(idx) - int(np.ceil(len(idx) * train_frac))) != len(test):
         raise ValueError("Incorrect test size")
 
     classes, dist = np.unique(y, return_counts=True)
 
     for cl, di in zip(classes, dist):
-        if int(np.ceil(di * selection_frac)) != sum(y[train] == cl):
+        if int(np.ceil(di * train_frac)) != sum(y[train] == cl):
             raise ValueError(f"Incorrect train class size {cl}")
-        if di - int(np.ceil(di * selection_frac)) != sum(y[test] == cl):
+        if di - int(np.ceil(di * train_frac)) != sum(y[test] == cl):
             raise ValueError(f"Incorrect test class size {cl}")