import numpy as np
from copy import deepcopy


def mean(X, n_replications, seed=None):
    """Simulate the posterior distribution of the mean.

    Parameter X: The observed data (array like)

    Parameter n_replications: The number of bootstrap replications to perform (positive integer)

    Parameter seed: Seed for PRNG (default None)

    Returns: Samples from the posterior
    """
    weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
    return np.dot(X, weights.T)
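
# A minimal usage sketch (hypothetical data, not part of the module):
#
#     X = np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=100)
#     posterior_means = mean(X, n_replications=10_000, seed=1)
#     np.percentile(posterior_means, [2.5, 97.5])  # approx. 95% credible interval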


def var(X, n_replications, seed=None):
    """Simulate the posterior distribution of the variance.

    Parameter X: The observed data (array like)

    Parameter n_replications: The number of bootstrap replications to perform (positive integer)

    Parameter seed: Seed for PRNG (default None)

    Returns: Samples from the posterior
    """
    X = np.asarray(X)
    X_squared = X ** 2
    weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
    # Weighted variance for each replication: E_w[X^2] - (E_w[X])^2.
    return [np.dot(X_squared, w) - np.dot(X, w) ** 2 for w in weights]


def covar(X, Y, n_replications, seed=None):
    """Simulate the posterior distribution of the covariance.

    Parameter X: The observed data, first variable (array like)

    Parameter Y: The observed data, second variable (array like)

    Parameter n_replications: The number of bootstrap replications to perform (positive integer)

    Parameter seed: Seed for PRNG (default None)

    Returns: Samples from the posterior
    """
    weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
    return [_weighted_covariance(X, Y, w) for w in weights]


def pearsonr(X, Y, n_replications, seed=None):
    """Simulate the posterior distribution of the Pearson correlation coefficient.

    Unlike scipy.stats.pearsonr
    (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html),
    this returns posterior samples of the coefficient rather than a point
    estimate and p-value.

    Parameter X: The observed data, first variable (array like)

    Parameter Y: The observed data, second variable (array like)

    Parameter n_replications: The number of bootstrap replications to perform (positive integer)

    Parameter seed: Seed for PRNG (default None)

    Returns: Samples from the posterior
    """
    weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
    return _weighted_pearsonr(X, Y, weights)
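
# A minimal usage sketch (hypothetical correlated data, not part of the module):
#
#     rng = np.random.default_rng(0)
#     X = rng.normal(size=200)
#     Y = 0.7 * X + rng.normal(scale=0.5, size=200)
#     r_posterior = pearsonr(X, Y, n_replications=5_000, seed=1)
#     np.median(r_posterior)  # posterior median correlation, roughly 0.8 for this setup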


def _weighted_covariance(X, Y, w):
    X_mean = np.dot(X, w.T).reshape(-1, 1)
    Y_mean = np.dot(Y, w.T).reshape(-1, 1)
    # Another approach, but less efficient:
    # np.diag(np.dot(w, ((X - X_mean) * (Y - Y_mean)).T))
    # https://stackoverflow.com/a/14759273
    return (w * ((X - X_mean) * (Y - Y_mean))).sum(-1)


def _weighted_pearsonr(X, Y, w):
    """Weighted Pearson correlation."""
    return _weighted_covariance(X, Y, w) / np.sqrt(_weighted_covariance(X, X, w) * _weighted_covariance(Y, Y, w))


def _weighted_ls(X, w, y):
    """Weighted least-squares coefficients via the normal equations."""
    w_matrix = np.diag(w)
    xtw = np.dot(X.T, w_matrix)
    # Solve (X^T W X) beta = X^T W y rather than explicitly inverting X^T W X.
    return np.linalg.solve(np.dot(xtw, X), np.dot(xtw, y))
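
# _weighted_ls solves the standard weighted least-squares normal equations
#
#     (X^T W X) beta = X^T W y,  with W = diag(w),
#
# i.e. ordinary least squares with each observation i weighted by the
# Dirichlet draw w_i.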


def linear_regression(X, y, n_replications, seed=None):
    """Simulate the posterior distribution of the least-squares coefficients.

    Parameter X: The observed data, independent variables (matrix like)

    Parameter y: The observed data, dependent variable (array like)

    Parameter n_replications: The number of bootstrap replications to perform (positive integer)

    Parameter seed: Seed for PRNG (default None)

    Returns: Samples from the posterior, one row of coefficients per replication
    """
    weights = np.random.default_rng(seed).dirichlet(np.ones(len(X)), n_replications)
    return np.vstack([_weighted_ls(X, w, y) for w in weights])
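
# A minimal usage sketch (hypothetical data; assumes X already includes an
# intercept column if one is wanted):
#
#     rng = np.random.default_rng(0)
#     X = np.column_stack([np.ones(100), rng.normal(size=100)])
#     y = 1.0 + 3.0 * X[:, 1] + rng.normal(size=100)
#     coef_posterior = linear_regression(X, y, n_replications=2_000, seed=1)
#     coef_posterior.mean(axis=0)  # roughly [1.0, 3.0]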


def bayesian_bootstrap(X, statistic, n_replications, resample_size, low_mem=False, seed=None):
    """Simulate the posterior distribution of the given statistic.

    Parameter X: The observed data (array like)

    Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)

    Parameter n_replications: The number of bootstrap replications to perform (positive integer)

    Parameter resample_size: The size of the dataset in each replication

    Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
    less memory, but will run slower as a result.

    Parameter seed: Seed for PRNG (default None)

    Returns: Samples from the posterior
    """
    X = np.asarray(X)
    samples = []
    rng = np.random.default_rng(seed)
    if low_mem:
        weights = (rng.dirichlet(np.ones(len(X))) for _ in range(n_replications))
    else:
        weights = rng.dirichlet(np.ones(len(X)), n_replications)
    for w in weights:
        sample_index = rng.choice(len(X), p=w, size=resample_size)
        samples.append(statistic(X[sample_index]))
    return samples
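
# A minimal usage sketch with a custom statistic (hypothetical data):
#
#     X = np.random.default_rng(0).exponential(size=100)
#     median_posterior = bayesian_bootstrap(X, np.median, n_replications=2_000,
#                                           resample_size=100, seed=1)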


def bayesian_bootstrap_regression(X, y, statistic, n_replications, resample_size, low_mem=False, seed=None):
    """Simulate the posterior distribution of a statistic that uses dependent and independent variables.

    Parameter X: The observed data, independent variables (matrix like)

    Parameter y: The observed data, dependent variable (array like)

    Parameter statistic: A function of the data to use in simulation (Function mapping array-like to number)

    Parameter n_replications: The number of bootstrap replications to perform (positive integer)

    Parameter resample_size: The size of the dataset in each replication. If None, no resampling is done;
    instead the statistic is called as statistic(X, y, w) with the replication's weights.

    Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch. Will use
    less memory, but will run slower as a result.

    Parameter seed: Seed for PRNG (default None)

    Returns: Samples from the posterior
    """
    samples = []
    X_arr = np.asarray(X)
    y_arr = np.asarray(y)
    rng = np.random.default_rng(seed)
    if low_mem:
        weights = (rng.dirichlet(np.ones(len(X))) for _ in range(n_replications))
    else:
        weights = rng.dirichlet(np.ones(len(X)), n_replications)
    for w in weights:
        if resample_size is None:
            s = statistic(X, y, w)
        else:
            resample_i = rng.choice(len(X_arr), p=w, size=resample_size)
            s = statistic(X_arr[resample_i], y_arr[resample_i])
        samples.append(s)

    return samples
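
# A minimal usage sketch (hypothetical statistic returning the fitted slope):
#
#     rng = np.random.default_rng(0)
#     X = rng.normal(size=(100, 1))
#     y = 2.0 * X[:, 0] + rng.normal(size=100)
#     slope = lambda Xr, yr: np.polyfit(Xr[:, 0], yr, 1)[0]
#     slope_posterior = bayesian_bootstrap_regression(X, y, slope, n_replications=1_000,
#                                                     resample_size=100, seed=1)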


class BayesianBootstrapBagging:
    """A bootstrap aggregating model using the Bayesian bootstrap. Similar to scikit-learn's BaggingRegressor."""

    def __init__(self, base_learner, n_replications, resample_size=None, low_mem=False, seed=None):
        """Initialize the base learners of the ensemble.

        Parameter base_learner: A scikit-learn like estimator. This object should implement a fit() and predict()
        method.

        Parameter n_replications: The number of bootstrap replications to perform (positive integer)

        Parameter resample_size: The size of the dataset in each replication. If None, each base learner is
        instead fit to the full dataset with the bootstrap weights passed as sample weights, so fit() must
        accept a sample-weight argument.

        Parameter low_mem (bool): Generate the weights for each iteration lazily instead of in a single batch.
        Will use less memory, but will run slower as a result.

        Parameter seed: Seed for PRNG (default None)
        """
        self.base_learner = base_learner
        self.n_replications = n_replications
        self.resample_size = resample_size
        self.low_mem = low_mem
        self.seed = seed

    def fit(self, X, y):
        """Fit the base learners of the ensemble on a dataset.

        Parameter X: The observed data, independent variables (matrix like)

        Parameter y: The observed data, dependent variable (array like)

        Returns: Fitted model
        """
        if self.resample_size is None:
            statistic = lambda X, y, w: deepcopy(self.base_learner).fit(X, y, w)  # noqa: E731
        else:
            statistic = lambda X, y: deepcopy(self.base_learner).fit(X, y)  # noqa: E731
        self.base_models_ = bayesian_bootstrap_regression(
            X, y, statistic, self.n_replications, self.resample_size, low_mem=self.low_mem, seed=self.seed
        )
        return self

    def predict(self, X):
        """Make average predictions for a collection of observations.

        Parameter X: The observed data, independent variables (matrix like)

        Returns: The predicted dependent variable values (array like)
        """
        y_posterior_samples = self.predict_posterior_samples(X)
        return np.array([np.mean(r) for r in y_posterior_samples])

    def predict_posterior_samples(self, X):
        """Simulate posterior samples for a collection of observations.

        Parameter X: The observed data, independent variables (matrix like)

        Returns: The simulated posterior samples, one row per observation (matrix like)
        """
        # len(X) x n_replications matrix: one column of predictions per base model.
        y_posterior_samples = np.zeros((len(X), self.n_replications))
        for i, m in enumerate(self.base_models_):
            y_posterior_samples[:, i] = m.predict(X)
        return y_posterior_samples

    def predict_central_interval(self, X, alpha=0.05):
        """The equal-tailed interval prediction containing a (1-alpha) fraction of the posterior samples.

        Parameter X: The observed data, independent variables (matrix like)

        Parameter alpha: The total size of the tails (Float between 0 and 1)

        Returns: Left and right interval bounds for each input (matrix like)
        """
        y_posterior_samples = self.predict_posterior_samples(X)
        return np.array([central_credible_interval(r, alpha=alpha) for r in y_posterior_samples])

    def predict_highest_density_interval(self, X, alpha=0.05):
        """The highest density interval prediction containing a (1-alpha) fraction of the posterior samples.

        Parameter X: The observed data, independent variables (matrix like)

        Parameter alpha: The total size of the tails (Float between 0 and 1)

        Returns: Left and right interval bounds for each input (matrix like)
        """
        y_posterior_samples = self.predict_posterior_samples(X)
        return np.array([highest_density_interval(r, alpha=alpha) for r in y_posterior_samples])
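
# A minimal usage sketch (assumes scikit-learn is installed; LinearRegression
# accepts sample weights, which the resample_size=None path relies on):
#
#     from sklearn.linear_model import LinearRegression
#
#     model = BayesianBootstrapBagging(LinearRegression(), n_replications=200, seed=1)
#     model.fit(X_train, y_train)          # X_train, y_train: hypothetical data
#     y_mean = model.predict(X_test)
#     y_bounds = model.predict_central_interval(X_test, alpha=0.05)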


def central_credible_interval(samples, alpha=0.05):
    """The equal-tailed interval containing a (1-alpha) fraction of the posterior samples.

    Parameter samples: The posterior samples (array like)

    Parameter alpha: The total size of the tails (Float between 0 and 1)

    Returns: Left and right interval bounds (tuple)
    """
    tail_size = int(round(len(samples) * (alpha / 2)))
    samples_sorted = sorted(samples)
    return samples_sorted[tail_size], samples_sorted[-tail_size - 1]


def highest_density_interval(samples, alpha=0.05):
    """The highest-density interval containing a (1-alpha) fraction of the posterior samples.

    Parameter samples: The posterior samples (array like)

    Parameter alpha: The total size of the tails (Float between 0 and 1)

    Returns: Left and right interval bounds (tuple)
    """
    samples_sorted = sorted(samples)
    window_size = int(len(samples) - round(len(samples) * alpha))
    # Slide a window covering (1-alpha) of the sorted samples and keep the narrowest one.
    # The + 1 makes the window anchored at the last valid position eligible too.
    smallest_window = (None, None)
    smallest_window_length = float("inf")
    for i in range(len(samples_sorted) - window_size + 1):
        window = samples_sorted[i], samples_sorted[i + window_size - 1]
        window_length = window[1] - window[0]
        if window_length < smallest_window_length:
            smallest_window_length = window_length
            smallest_window = window
    return smallest_window
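
# Sketch of the difference between the two intervals on a skewed posterior
# (hypothetical data):
#
#     draws = np.random.default_rng(0).exponential(size=10_000)
#     central_credible_interval(draws, alpha=0.1)   # equal 5% in each tail
#     highest_density_interval(draws, alpha=0.1)    # narrowest 90% window,
#                                                   # shifted toward 0 here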


def _bootstrap_replicate(X, seed=None):
    """Draw one set of Bayesian bootstrap weights as gaps between sorted uniforms."""
    random_points = np.sort(np.random.default_rng(seed).uniform(0, 1, len(X) - 1))
    # Pad with the endpoints 0 and 1, then take consecutive differences.
    return np.diff(np.concatenate(([0.0], random_points, [1.0])))
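
# Note: the gaps between 0, the len(X) - 1 sorted uniforms, and 1 are distributed
# Dirichlet(1, ..., 1), so this is an alternative way of drawing the flat-Dirichlet
# weights used by the functions above (cf. Rubin, "The Bayesian Bootstrap", 1981).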