27 changes: 13 additions & 14 deletions src/pyuoi/linear_model/base.py
@@ -3,7 +3,6 @@
import logging
from sklearn.linear_model._base import SparseCoefMixin
from sklearn.metrics import r2_score, accuracy_score, log_loss
-from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y
from sklearn.preprocessing import StandardScaler

@@ -13,7 +12,7 @@
from pyuoi.mpi_utils import (Gatherv_rows, Bcast_from_root)

from .utils import stability_selection_to_threshold, intersection
-from ..utils import check_logger
+from ..utils import check_logger, resample


class AbstractUoILinearModel(SparseCoefMixin, metaclass=_abc.ABCMeta):
@@ -248,20 +247,20 @@ def fit(self, X, y, stratify=None, verbose=False):
        for boot in range(self.n_boots_sel):
            if size > 1:
                if rank == 0:
-                    rvals = train_test_split(np.arange(X.shape[0]),
-                                             test_size=1 - self.selection_frac,
-                                             stratify=stratify,
-                                             random_state=self.random_state)
+                    rvals = resample(np.arange(X.shape[0]),
+                                     train_frac=self.selection_frac,
+                                     stratify=stratify,
+                                     random_state=self.random_state)
                else:
                    rvals = [None] * 2
                rvals = [Bcast_from_root(rval, self.comm, root=0)
                         for rval in rvals]
                if boot in my_boots.keys():
                    my_boots[boot] = rvals
            else:
-                my_boots[boot] = train_test_split(
+                my_boots[boot] = resample(
                    np.arange(X.shape[0]),
-                    test_size=1 - self.selection_frac,
+                    train_frac=self.selection_frac,
                    stratify=stratify,
                    random_state=self.random_state)

@@ -339,20 +338,20 @@ def fit(self, X, y, stratify=None, verbose=False):
        for boot in range(self.n_boots_est):
            if size > 1:
                if rank == 0:
-                    rvals = train_test_split(np.arange(X.shape[0]),
-                                             test_size=1 - self.estimation_frac,
-                                             stratify=stratify,
-                                             random_state=self.random_state)
+                    rvals = resample(np.arange(X.shape[0]),
+                                     train_frac=self.estimation_frac,
+                                     stratify=stratify,
+                                     random_state=self.random_state)
                else:
                    rvals = [None] * 2
                rvals = [Bcast_from_root(rval, self.comm, root=0)
                         for rval in rvals]
                if boot in my_boots.keys():
                    my_boots[boot] = rvals
            else:
-                my_boots[boot] = train_test_split(
+                my_boots[boot] = resample(
                    np.arange(X.shape[0]),
-                    test_size=1 - self.estimation_frac,
+                    train_frac=self.estimation_frac,
                    stratify=stratify,
                    random_state=self.random_state)

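In both bootstrap loops above, the split is now produced by the new resample helper rather than by calling train_test_split directly. As a hedged sketch (not part of this diff), the two calls should yield identical index splits, since resample forwards its arguments to train_test_split as defined in src/pyuoi/utils.py below:

    import numpy as np
    from sklearn.model_selection import train_test_split
    from pyuoi.utils import resample

    frac = 0.9
    idx = np.arange(100)

    # Old call in base.py: pass the held-out fraction as test_size.
    old_train, old_test = train_test_split(idx, test_size=1 - frac,
                                           stratify=None, random_state=0)
    # New call: pass the training fraction directly; resample forwards the
    # same arguments to train_test_split internally.
    new_train, new_test = resample(idx, train_frac=frac, stratify=None,
                                   random_state=0)

    assert np.array_equal(old_train, new_train)
    assert np.array_equal(old_test, new_test)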
9 changes: 9 additions & 0 deletions src/pyuoi/utils.py
@@ -1,6 +1,7 @@
import numpy as np
import sys
import logging
+from sklearn.model_selection import train_test_split


def softmax(y, axis=-1):
@@ -152,3 +153,11 @@ def check_logger(logger, name='uoi', comm=None):
        handler.setFormatter(logging.Formatter(fmt))
        ret.addHandler(handler)
    return ret
+
+
+def resample(X, train_frac, stratify, random_state):
+    rvals = train_test_split(np.arange(X.shape[0]),
+                             test_size=1 - train_frac,
+                             stratify=stratify,
+                             random_state=random_state)
+    return rvals
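For reference, a usage sketch of the helper added above (not part of the diff). Note that resample splits sample indices rather than the data itself, and stratify keeps the class proportions of y in both halves:

    import numpy as np
    from pyuoi.utils import resample

    # 100 samples, 5 classes with 20 samples each.
    X = np.random.randn(100, 3)
    y = np.tile(np.arange(5), 20)

    # resample only uses X.shape[0]: it splits np.arange(100) into
    # train/test index arrays, with ~80% of indices going to train.
    train_idx, test_idx = resample(X, train_frac=0.8, stratify=y,
                                   random_state=0)

    assert len(train_idx) + len(test_idx) == X.shape[0]
    # Each class contributes proportionally to the training indices.
    print(np.unique(y[train_idx], return_counts=True))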
22 changes: 21 additions & 1 deletion tests/test_utils.py
@@ -7,7 +7,7 @@
from pyuoi.linear_model.utils import stability_selection_to_threshold
from pyuoi.linear_model.utils import intersection

-from pyuoi.utils import check_logger
+from pyuoi.utils import check_logger, resample

import logging
try:
@@ -331,3 +331,23 @@ def test_check_logger_exists():
    logger = logging.getLogger()
    ret = check_logger(logger)
    assert ret is logger
+
+
+def test_class_stratify_check():
+    train_frac = 0.9
+    idx = np.arange(100)
+    y = np.tile(np.arange(5), 20)
+    train, test = resample(idx, train_frac=train_frac, random_state=0, stratify=y)
+
+    if int(np.ceil(len(idx) * train_frac)) != len(train):
+        raise ValueError("Incorrect train size")
+    if (len(idx) - int(np.ceil(len(idx) * train_frac))) != len(test):
+        raise ValueError("Incorrect test size")
+
+    classes, dist = np.unique(y, return_counts=True)
+
+    for cl, di in zip(classes, dist):
+        if int(np.ceil(di * train_frac)) != sum(y[train] == cl):
+            raise ValueError(f"Incorrect train class size {cl}")
+        if di - int(np.ceil(di * train_frac)) != sum(y[test] == cl):
+            raise ValueError(f"Incorrect test class size {cl}")