Skip to content

Commit ce52d83

Browse files
authored
Merge pull request #19 from genos/simplify-importing
Simplify Importing
2 parents 57a093a + 7dfb330 commit ce52d83

File tree

7 files changed

+318
-318
lines changed

7 files changed

+318
-318
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ The code below performs the simulation and calculates the 95% highest density in
4848
Included for reference in the image is the same dataset used in a classical bootstrap, to illustrate the comparative
4949
smoothness of the bayesian version.
5050
```
51-
from bayesian_bootstrap.bootstrap import mean, highest_density_interval
51+
from bayesian_bootstrap import mean, highest_density_interval
5252
posterior_samples = mean(X, 10000)
5353
l, r = highest_density_interval(posterior_samples)
5454
@@ -62,7 +62,7 @@ The above code uses the `mean` method to simulate the posterior distribution of
6262
following code demonstrates doing this for the posterior of the mean:
6363

6464
```
65-
from bayesian_bootstrap.bootstrap import bayesian_bootstrap
65+
from bayesian_bootstrap import bayesian_bootstrap
6666
posterior_samples = bayesian_bootstrap(X, np.mean, 10000, 100)
6767
```
6868

bayesian_bootstrap/__init__.py

Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
import numpy as np
2+
from copy import deepcopy
3+
4+
5+
def mean(X, n_replications, seed=None):
6+
"""Simulate the posterior distribution of the mean.
7+
8+
Parameter X: The observed data (array like)
9+
10+
Parameter n_replications: The number of bootstrap replications to perform (positive integer)
11+
12+
Parameter seed: Seed for PRNG (default None)
13+
14+
Returns: Samples from the posterior
15+
"""
16+
weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
17+
return np.dot(X, weights.T)
18+
19+
20+
def var(X, n_replications, seed=None):
21+
"""Simulate the posterior distribution of the variance.
22+
23+
Parameter X: The observed data (array like)
24+
25+
Parameter n_replications: The number of bootstrap replications to perform (positive integer)
26+
27+
Parameter seed: Seed for PRNG (default None)
28+
29+
Returns: Samples from the posterior
30+
"""
31+
samples = []
32+
weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications)
33+
for w in weights:
34+
samples.append(np.dot([x ** 2 for x in X], w) - np.dot(X, w) ** 2)
35+
return samples
36+
37+
38+
def covar(X, Y, n_replications, seed=None):
39+
"""Simulate the posterior distribution of the covariance.
40+
41+
Parameter X: The observed data, first variable (array like)
42+
43+
Parameter Y: The observed data, second (array like)
44+
45+
Parameter n_replications: The number of bootstrap replications to perform (positive integer)
46+
47+
Parameter seed: Seed for PRNG (default None)
48+
49+
Returns: Samples from the posterior
50+
"""
51+
samples = []
52+
weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications)
53+
for w in weights:
54+
cv = _weighted_covariance(X, Y, w)
55+
samples.append(cv)
56+
return samples
57+
58+
59+
def pearsonr(X, Y, n_replications, seed=None):
60+
"""
61+
Pearson correlation coefficient and p-value for testing non-correlation.
62+
63+
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html
64+
65+
"""
66+
weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
67+
return _weighted_pearsonr(X, Y, weights)
68+
69+
70+
def _weighted_covariance(X, Y, w):
71+
X_mean = np.dot(X, w.T).reshape(-1, 1)
72+
Y_mean = np.dot(Y, w.T).reshape(-1, 1)
73+
# Another approach, but less efficient
74+
# np.diag(np.dot(w, (x - X_mean) * (y - Y_mean)).T)
75+
# https://stackoverflow.com/a/14759273
76+
return (w * ((X - X_mean) * (Y - Y_mean))).sum(-1)
77+
78+
79+
def _weighted_pearsonr(X, Y, w):
80+
"""
81+
Weighted Pearson correlation.
82+
83+
"""
84+
return _weighted_covariance(X, Y, w) / np.sqrt(_weighted_covariance(X, X, w) * _weighted_covariance(Y, Y, w))
85+
86+
87+
def _weighted_ls(X, w, y):
88+
x_rows, x_cols = X.shape
89+
w_matrix = np.array(w) * np.eye(x_rows)
90+
coef = np.dot(
91+
np.dot(np.dot(np.linalg.inv(np.dot(np.dot(X.T, w_matrix), X)), X.T), w_matrix),
92+
y,
93+
)
94+
return coef
95+
96+
97+
def linear_regression(X, y, n_replications, seed=None):
98+
coef_samples = []
99+
weights = np.random.default_rng(seed).dirichlet([1] * len(X), n_replications)
100+
for w in weights:
101+
coef_samples.append(_weighted_ls(X, w, y))
102+
return np.vstack(coef_samples)
103+
104+
105+
def bayesian_bootstrap(X, statistic, n_replications, resample_size, low_mem=False, seed=None):
106+
"""Simulate the posterior distribution of the given statistic.
107+
108+
Parameter X: The observed data (array like)
109+
110+
Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
111+
112+
Parameter n_replications: The number of bootstrap replications to perform (positive integer)
113+
114+
Parameter resample_size: The size of the dataset in each replication
115+
116+
Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
117+
less memory, but will run slower as a result.
118+
119+
Parameter seed: Seed for PRNG (default None)
120+
121+
Returns: Samples from the posterior
122+
"""
123+
if isinstance(X, list):
124+
X = np.array(X)
125+
samples = []
126+
rng = np.random.default_rng(seed)
127+
if low_mem:
128+
weights = (rng.dirichlet([1] * len(X)) for _ in range(n_replications))
129+
else:
130+
weights = rng.dirichlet([1] * len(X), n_replications)
131+
for w in weights:
132+
sample_index = rng.choice(range(len(X)), p=w, size=resample_size)
133+
resample_X = X[sample_index]
134+
s = statistic(resample_X)
135+
samples.append(s)
136+
return samples
137+
138+
139+
def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size, low_mem=False, seed=None):
140+
"""Simulate the posterior distribution of a statistic that uses dependent and independent variables.
141+
142+
Parameter X: The observed data, independent variables (matrix like)
143+
144+
Parameter y: The observed data, dependent variable (array like)
145+
146+
Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)
147+
148+
Parameter n_replications: The number of bootstrap replications to perform (positive integer)
149+
150+
Parameter resample_size: The size of the dataset in each replication
151+
152+
Parameter low_mem(bool): Use looping instead of generating all the dirichlet, use if program use too much memory
153+
154+
Parameter seed: Seed for PRNG (default None)
155+
156+
Returns: Samples from the posterior
157+
"""
158+
samples = []
159+
X_arr = np.array(X)
160+
y_arr = np.array(y)
161+
rng = np.random.default_rng(seed)
162+
if low_mem:
163+
weights = (rng.dirichlet([1] * len(X)) for _ in range(n_replications))
164+
else:
165+
weights = rng.dirichlet([1] * len(X), n_replications)
166+
for w in weights:
167+
if resample_size is None:
168+
s = statistic(X, y, w)
169+
else:
170+
resample_i = rng.choice(range(len(X_arr)), p=w, size=resample_size)
171+
resample_X = X_arr[resample_i]
172+
resample_y = y_arr[resample_i]
173+
s = statistic(resample_X, resample_y)
174+
samples.append(s)
175+
176+
return samples
177+
178+
179+
class BayesianBootstrapBagging:
180+
"""A bootstrap aggregating model using the bayesian bootstrap. Similar to scikit-learn's BaggingRegressor."""
181+
182+
def __init__(self, base_learner, n_replications, resample_size=None, low_mem=False, seed=None):
183+
"""Initialize the base learners of the ensemble.
184+
185+
Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict()
186+
method.
187+
188+
Parameter n_replications: The number of bootstrap replications to perform (positive integer)
189+
190+
Parameter resample_size: The size of the dataset in each replication
191+
192+
Parameter low_mem(bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
193+
less memory, but will run slower as a result.
194+
195+
Parameter seed: Seed for PRNG (default None)
196+
"""
197+
self.base_learner = base_learner
198+
self.n_replications = n_replications
199+
self.resample_size = resample_size
200+
self.memo = low_mem
201+
self.seed = seed
202+
203+
def fit(self, X, y):
204+
"""Fit the base learners of the ensemble on a dataset.
205+
206+
Parameter X: The observed data, independent variables (matrix like)
207+
208+
Parameter y: The observed data, dependent variable (array like)
209+
210+
Returns: Fitted model
211+
"""
212+
if self.resample_size is None:
213+
statistic = lambda X, y, w: deepcopy(self.base_learner).fit(X, y, w) # noqa: E731
214+
else:
215+
statistic = lambda X, y: deepcopy(self.base_learner).fit(X, y) # noqa: E731
216+
self.base_models_ = bayesian_bootstrap_regression(
217+
X, y, statistic, self.n_replications, self.resample_size, low_mem=self.memo, seed=self.seed
218+
)
219+
return self
220+
221+
def predict(self, X):
222+
"""Make average predictions for a collection of observations.
223+
224+
Parameter X: The observed data, independent variables (matrix like)
225+
226+
Returns: The predicted dependent variable values (array like)
227+
"""
228+
y_posterior_samples = self.predict_posterior_samples(X)
229+
return np.array([np.mean(r) for r in y_posterior_samples])
230+
231+
def predict_posterior_samples(self, X):
232+
"""Simulate posterior samples for a collection of observations.
233+
234+
Parameter X: The observed data, independent variables (matrix like)
235+
236+
Returns: The simulated posterior mean (matrix like)
237+
"""
238+
# Return a X_r x self.n_replications matrix
239+
y_posterior_samples = np.zeros((len(X), self.n_replications))
240+
for i, m in enumerate(self.base_models_):
241+
y_posterior_samples[:, i] = m.predict(X)
242+
return y_posterior_samples
243+
244+
def predict_central_interval(self, X, alpha=0.05):
245+
"""The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.
246+
247+
Parameter X: The observed data, independent variables (matrix like)
248+
249+
Parameter alpha: The total size of the tails (Float between 0 and 1)
250+
251+
Returns: Left and right interval bounds for each input (matrix like)
252+
"""
253+
y_posterior_samples = self.predict_posterior_samples(X)
254+
return np.array([central_credible_interval(r, alpha=alpha) for r in y_posterior_samples])
255+
256+
def predict_highest_density_interval(self, X, alpha=0.05):
257+
"""The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.
258+
259+
Parameter X: The observed data, independent variables (matrix like)
260+
261+
Parameter alpha: The total size of the tails (Float between 0 and 1)
262+
263+
Returns: Left and right interval bounds for each input (matrix like):
264+
"""
265+
y_posterior_samples = self.predict_posterior_samples(X)
266+
return np.array([highest_density_interval(r, alpha=alpha) for r in y_posterior_samples])
267+
268+
269+
def central_credible_interval(samples, alpha=0.05):
270+
"""The equal-tailed interval containing a (1-alpha) fraction of the posterior samples.
271+
272+
Parameter samples: The posterior samples (array like)
273+
274+
Parameter alpha: The total size of the tails (Float between 0 and 1)
275+
276+
Returns: Left and right interval bounds (tuple)
277+
"""
278+
tail_size = int(round(len(samples) * (alpha / 2)))
279+
samples_sorted = sorted(samples)
280+
return samples_sorted[tail_size], samples_sorted[-tail_size - 1]
281+
282+
283+
def highest_density_interval(samples, alpha=0.05):
284+
"""The highest-density interval containing a (1-alpha) fraction of the posterior samples.
285+
286+
Parameter samples: The posterior samples (array like)
287+
288+
Parameter alpha: The total size of the tails (Float between 0 and 1)
289+
290+
Returns: Left and right interval bounds (tuple)
291+
"""
292+
samples_sorted = sorted(samples)
293+
window_size = int(len(samples) - round(len(samples) * alpha))
294+
smallest_window = (None, None)
295+
smallest_window_length = float("inf")
296+
for i in range(len(samples_sorted) - window_size):
297+
window = samples_sorted[i + window_size - 1], samples_sorted[i]
298+
window_length = samples_sorted[i + window_size - 1] - samples_sorted[i]
299+
if window_length < smallest_window_length:
300+
smallest_window_length = window_length
301+
smallest_window = window
302+
return smallest_window[1], smallest_window[0]
303+
304+
305+
def _bootstrap_replicate(X, seed=None):
306+
random_points = sorted(np.random.default_rng(seed).uniform(0, 1, len(X) - 1))
307+
random_points.append(1)
308+
random_points.insert(0, 0)
309+
gaps = [right - left for left, right in zip(random_points[:-1], random_points[1:])]
310+
return np.array(gaps)

0 commit comments

Comments
 (0)