27 changes: 13 additions & 14 deletions src/pyuoi/linear_model/base.py
@@ -3,7 +3,6 @@
import logging
from sklearn.linear_model._base import SparseCoefMixin
from sklearn.metrics import r2_score, accuracy_score, log_loss
-from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y
from sklearn.preprocessing import StandardScaler

@@ -13,7 +12,7 @@
from pyuoi.mpi_utils import (Gatherv_rows, Bcast_from_root)

from .utils import stability_selection_to_threshold, intersection
-from ..utils import check_logger
+from ..utils import check_logger, resample


class AbstractUoILinearModel(SparseCoefMixin, metaclass=_abc.ABCMeta):
@@ -248,20 +247,20 @@ def fit(self, X, y, stratify=None, verbose=False):
        for boot in range(self.n_boots_sel):
            if size > 1:
                if rank == 0:
-                    rvals = train_test_split(np.arange(X.shape[0]),
-                                             test_size=1 - self.selection_frac,
-                                             stratify=stratify,
-                                             random_state=self.random_state)
+                    rvals = resample(np.arange(X.shape[0]),
+                                     train_frac=self.selection_frac,
+                                     stratify=stratify,
+                                     random_state=self.random_state)
                else:
                    rvals = [None] * 2
                rvals = [Bcast_from_root(rval, self.comm, root=0)
                         for rval in rvals]
                if boot in my_boots.keys():
                    my_boots[boot] = rvals
            else:
-                my_boots[boot] = train_test_split(
+                my_boots[boot] = resample(
                    np.arange(X.shape[0]),
-                    test_size=1 - self.selection_frac,
+                    train_frac=self.selection_frac,
                    stratify=stratify,
                    random_state=self.random_state)

@@ -339,20 +338,20 @@ def fit(self, X, y, stratify=None, verbose=False):
        for boot in range(self.n_boots_est):
            if size > 1:
                if rank == 0:
-                    rvals = train_test_split(np.arange(X.shape[0]),
-                                             test_size=1 - self.estimation_frac,
-                                             stratify=stratify,
-                                             random_state=self.random_state)
+                    rvals = resample(np.arange(X.shape[0]),
+                                     train_frac=self.estimation_frac,
+                                     stratify=stratify,
+                                     random_state=self.random_state)
                else:
                    rvals = [None] * 2
                rvals = [Bcast_from_root(rval, self.comm, root=0)
                         for rval in rvals]
                if boot in my_boots.keys():
                    my_boots[boot] = rvals
            else:
-                my_boots[boot] = train_test_split(
+                my_boots[boot] = resample(
                    np.arange(X.shape[0]),
-                    test_size=1 - self.estimation_frac,
+                    train_frac=self.estimation_frac,
                    stratify=stratify,
                    random_state=self.random_state)

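In both bootstrap loops above, the split is now produced by the new resample helper rather than by calling train_test_split directly. As a hedged sketch (not part of this diff), the two calls should yield identical index splits, since resample forwards its arguments to train_test_split as defined in src/pyuoi/utils.py below:

    import numpy as np
    from sklearn.model_selection import train_test_split
    from pyuoi.utils import resample

    frac = 0.9
    idx = np.arange(100)

    # Old call in base.py: pass the held-out fraction as test_size.
    old_train, old_test = train_test_split(idx, test_size=1 - frac,
                                           stratify=None, random_state=0)
    # New call: pass the training fraction directly; resample forwards the
    # same arguments to train_test_split internally.
    new_train, new_test = resample(idx, train_frac=frac, stratify=None,
                                   random_state=0)

    assert np.array_equal(old_train, new_train)
    assert np.array_equal(old_test, new_test)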
9 changes: 9 additions & 0 deletions src/pyuoi/utils.py
@@ -1,6 +1,7 @@
import numpy as np
import sys
import logging
+from sklearn.model_selection import train_test_split


def softmax(y, axis=-1):
@@ -152,3 +153,11 @@ def check_logger(logger, name='uoi', comm=None):
        handler.setFormatter(logging.Formatter(fmt))
        ret.addHandler(handler)
    return ret
+
+
+def resample(X, train_frac, stratify, random_state):
+    rvals = train_test_split(np.arange(X.shape[0]),
+                             test_size=1 - train_frac,
+                             stratify=stratify,
+                             random_state=random_state)
+    return rvals
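For reference, a usage sketch of the helper added above (not part of the diff). Note that resample splits sample indices rather than the data itself, and stratify keeps the class proportions of y in both halves:

    import numpy as np
    from pyuoi.utils import resample

    # 100 samples, 5 classes with 20 samples each.
    X = np.random.randn(100, 3)
    y = np.tile(np.arange(5), 20)

    # resample only uses X.shape[0]: it splits np.arange(100) into
    # train/test index arrays, with ~80% of indices going to train.
    train_idx, test_idx = resample(X, train_frac=0.8, stratify=y,
                                   random_state=0)

    assert len(train_idx) + len(test_idx) == X.shape[0]
    # Each class contributes proportionally to the training indices.
    print(np.unique(y[train_idx], return_counts=True))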
22 changes: 21 additions & 1 deletion tests/test_utils.py
@@ -7,7 +7,7 @@
from pyuoi.linear_model.utils import stability_selection_to_threshold
from pyuoi.linear_model.utils import intersection

-from pyuoi.utils import check_logger
+from pyuoi.utils import check_logger, resample

import logging
try:
@@ -331,3 +331,23 @@ def test_check_logger_exists():
    logger = logging.getLogger()
    ret = check_logger(logger)
    assert ret is logger
+
+
+def test_class_stratify_check():
+    train_frac = 0.9
+    idx = np.arange(100)
+    y = np.tile(np.arange(5), 20)
+    train, test = resample(idx, train_frac=train_frac, random_state=0, stratify=y)
+
+    if int(np.ceil(len(idx) * train_frac)) != len(train):
+        raise ValueError("Incorrect train size")
+    if (len(idx) - int(np.ceil(len(idx) * train_frac))) != len(test):
+        raise ValueError("Incorrect test size")
+
+    classes, dist = np.unique(y, return_counts=True)
+
+    for cl, di in zip(classes, dist):
+        if int(np.ceil(di * train_frac)) != sum(y[train] == cl):
+            raise ValueError(f"Incorrect train class size {cl}")
+        if di - int(np.ceil(di * train_frac)) != sum(y[test] == cl):
+            raise ValueError(f"Incorrect test class size {cl}")