Skip to content

Commit 0675d82

Browse files
committed
helper functions
1 parent f880cad commit 0675d82

File tree

7 files changed

+398
-0
lines changed

7 files changed

+398
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"runtime, ave":{"ozs large":2.41,"hard large":3.25,"soft large":17.41},"runtime, std dev":{"ozs large":0.08,"hard large":0.09,"soft large":0.5},"ESS_bulk\/s":{"ozs large":1305.86,"hard large":981.53,"soft large":181.15}}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"runtime, ave":{"ozs small":2.21,"hard small":3.07,"soft small":54.49},"runtime, std dev":{"ozs small":0.05,"hard small":0.08,"soft small":2.54},"ESS_bulk\/s":{"ozs small":1327.11,"hard small":988.47,"soft small":63.14}}
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Compare ESS/sec using binomial model
2+
# and simulated datasets based on smaller, larger number of observations.
3+
import os
4+
import numpy as np
5+
import pandas as pd
6+
from typing import Any, Dict, List, Tuple
7+
from cmdstanpy import CmdStanModel, set_cmdstan_path
8+
9+
10+
import logging
11+
cmdstanpy_logger = logging.getLogger("cmdstanpy")
12+
cmdstanpy_logger.setLevel(logging.FATAL)
13+
14+
import warnings
15+
warnings.filterwarnings('ignore')
16+
17+
# NOTE(review): hard-coded personal path — assumes a local CmdStan checkout at
# this exact location; consider the CMDSTAN environment variable instead. TODO confirm.
set_cmdstan_path(os.path.join('/Users', 'mitzi', 'github', 'stan-dev', 'cmdstan'))

from utils import simulate_data
20+
21+
def time_fits(N: int, model: CmdStanModel, data: dict) -> np.ndarray:
    """Fit the model to the dataset N times and record timing statistics.

    :param N: number of repeated fits
    :param model: compiled CmdStan model
    :param data: Stan input data dict
    :return: array of shape (N, 2); column 0 is total wall-clock time summed
        over all chains, column 1 is the sampler-reported ESS_bulk/s of lp__.
    """
    # np.empty instead of the raw np.ndarray constructor; every slot is
    # assigned in the loop below, so no initialization is needed.
    fit_times = np.empty(shape=(N, 2), dtype=float)
    for i in range(N):
        print('Run', i)
        fit = model.sample(data=data, parallel_chains=4,
                           show_progress=False, show_console=False, refresh=10_000)
        fit_summary = fit.summary()
        # total wall-clock time is the sum of per-chain totals
        total_time = sum(chain_time['total'] for chain_time in fit.time)
        fit_times[i, 0] = total_time
        fit_times[i, 1] = fit_summary.loc['lp__', 'ESS_bulk/s']
    return fit_times
39+
40+
41+
def summarize_times(data_pairs: List[Tuple[str, np.ndarray]]) -> pd.DataFrame:
    """Summarize per-run timings into a labeled DataFrame.

    :param data_pairs: (label, array) pairs where each array has shape (N, 2):
        column 0 is wall-clock time, column 1 is ESS_bulk/s.
    :return: DataFrame indexed by label with columns 'mean' and 'std dev'
        (of wall-clock time) and 'ESS_bulk/s' (mean), rounded to 2 decimals.
    """
    result_data = []
    for label, array in data_pairs:
        # hoist the column means: the original recomputed np.mean(axis=0) twice
        col_means = np.mean(array, axis=0)
        result_data.append({
            'label': label,
            'mean': col_means[0],
            'std dev': np.std(array, axis=0)[0],
            'ESS_bulk/s': col_means[1]
        })
    df = pd.DataFrame(result_data)
    return df.set_index('label').round(2)
54+
55+
56+
# Create datasets - fix sizes, and seed
N_eth = 3         # number of ethnicity categories
N_edu = 5         # number of education categories
N_age = 9         # number of age categories
baseline = -3.5   # generator intercept (log-odds scale)
sens = 0.75       # assumed test sensitivity
spec = 0.9995     # assumed test specificity
# Same seed for both datasets; only observations-per-stratum differs (17 vs 200).
data_small = simulate_data(N_eth, N_edu, N_age, baseline, sens, spec, 17, seed=45678)
data_large = simulate_data(N_eth, N_edu, N_age, baseline, sens, spec, 200, seed=45678)

# sum to zero vector

binomial_ozs_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_ozs.stan'))
times_ozs_large = time_fits(100, binomial_ozs_mod, data_large)
times_ozs_small = time_fits(100, binomial_ozs_mod, data_small)

# hard sum-to-zero constraint

binomial_hard_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_hard.stan'))
times_hard_small = time_fits(100, binomial_hard_mod, data_small)
times_hard_large = time_fits(100, binomial_hard_mod, data_large)

# soft sum-to-zero constraint

binomial_soft_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_soft.stan'))
times_soft_small = time_fits(100, binomial_soft_mod, data_small)
times_soft_large = time_fits(100, binomial_soft_mod, data_large)

# Summarize the 100-run timings per model and write one JSON per dataset size.
df_small = summarize_times([('ozs small', times_ozs_small),
                            ('hard small', times_hard_small),
                            ('soft small', times_soft_small)])
df_small.to_json("binomial_runtimes_small.json")

df_large = summarize_times([('ozs large', times_ozs_large),
                            ('hard large', times_hard_large),
                            ('soft large', times_soft_large)])
df_large.to_json("binomial_runtimes_large.json")
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import os
2+
import numpy as np
3+
from cmdstanpy import CmdStanModel
4+
from typing import Any, Dict
5+
6+
def simulate_data(
    N_eth: int, N_edu: int, N_age: int,
    baseline: float, sens: float, spec: float,
    N_obs_per_stratum: int, *, seed: int = 345678
) -> Dict[str, Any]:
    """Run the data-generating Stan model and return simulated data plus metadata.

    :param N_eth: number of ethnicity categories
    :param N_edu: number of education categories
    :param N_age: number of age categories
    :param baseline: generator intercept; also used downstream as the
        intercept prior mean
    :param sens: assumed test sensitivity
    :param spec: assumed test specificity
    :param N_obs_per_stratum: observations per (sex, eth, edu, age) stratum
    :param seed: RNG seed forwarded to CmdStan
    :return: dict of inputs for the binomial models: simulated outcomes,
        the generating parameters, and the CmdStan seed actually used.
    """
    # 2 sexes x eth x edu x age strata, N_obs_per_stratum observations each
    N = 2 * N_eth * N_edu * N_age * N_obs_per_stratum

    inputs_dict = {
        'N': N, 'N_eth': N_eth, 'N_edu': N_edu, 'N_age': N_age,
        'baseline': baseline, 'sens': sens, 'spec': spec
    }
    datagen_mod = CmdStanModel(stan_file=os.path.join('stan', 'gen_binomial_4_preds.stan'))
    # Bug fix: forward the caller's seed; previously the literal 45678 was
    # passed here, silently ignoring the `seed` keyword argument.
    sim_data = datagen_mod.sample(data=inputs_dict, iter_warmup=1, iter_sampling=1, chains=1,
                                  show_progress=False, show_console=False, refresh=10_000,
                                  seed=seed)
    # Single draw (iter_sampling=1): [0] selects it; count/index variables are
    # cast to int because Stan CSV output stores them as doubles.
    gen_data = {
        'N': sim_data.pos_tests.shape[1],
        'N_age': N_age,
        'N_eth': N_eth,
        'N_edu': N_edu,
        'sens': sens,
        'spec': spec,
        'intercept_prior_mean': baseline,
        'intercept_prior_scale': 2.5,
        'pos_tests': sim_data.pos_tests[0].astype(int),
        'tests': sim_data.tests[0].astype(int),
        'sex': sim_data.sex[0].astype(int),
        'age': sim_data.age[0].astype(int),
        'eth': sim_data.eth[0].astype(int),
        'edu': sim_data.edu[0].astype(int),
        'beta_0': sim_data.beta_0[0],
        'pct_sex': sim_data.pct_sex[0],
        'beta_sex': sim_data.beta_sex[0],
        'pct_age': sim_data.pct_age[0],
        'beta_age': sim_data.beta_age[0],
        'pct_eth': sim_data.pct_eth[0],
        'beta_eth': sim_data.beta_eth[0],
        'pct_edu': sim_data.pct_edu[0],
        'beta_edu': sim_data.beta_edu[0],
        'seed': sim_data.metadata.cmdstan_config['seed'],
    }
    return gen_data
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
# Compare ESS/sec using binomial model
2+
# and simulated datasets based on smaller, larger number of observations.
3+
4+
5+
import os
6+
import numpy as np
7+
import pandas as pd
8+
from typing import Any, Dict, List, Tuple
9+
from cmdstanpy import CmdStanModel
10+
11+
import logging
12+
cmdstanpy_logger = logging.getLogger("cmdstanpy")
13+
cmdstanpy_logger.setLevel(logging.ERROR)
14+
15+
import warnings
16+
warnings.filterwarnings('ignore')
17+
18+
def time_fits(N: int, model: CmdStanModel, data: dict) -> np.ndarray:
    """Fit the model to the dataset N times and record timing statistics.

    :param N: number of repeated fits
    :param model: compiled CmdStan model
    :param data: Stan input data dict
    :return: array of shape (N, 2); column 0 is total wall-clock time summed
        over all chains, column 1 is ESS_bulk of lp__ divided by that time.
    """
    # np.empty instead of the raw np.ndarray constructor; every slot is
    # assigned in the loop below, so no initialization is needed.
    fit_times = np.empty(shape=(N, 2), dtype=float)
    for i in range(N):
        fit = model.sample(data=data, parallel_chains=4,
                           show_progress=False, show_console=False, refresh=10_000)
        fit_summary = fit.summary()
        # total wall-clock time is the sum of per-chain totals
        total_time = sum(chain_time['total'] for chain_time in fit.time)
        fit_times[i, 0] = total_time
        fit_times[i, 1] = fit_summary.loc['lp__', 'ESS_bulk'] / total_time
    return fit_times
35+
36+
37+
def summarize_times(data_pairs: List[Tuple[str, np.ndarray]]) -> pd.DataFrame:
    """Summarize per-run timings into a labeled DataFrame.

    :param data_pairs: (label, array) pairs where each array has shape (N, 2):
        column 0 is wall-clock time, column 1 is ESS/sec.
    :return: DataFrame indexed by label with columns 'mean' and 'std dev'
        (of wall-clock time) and 'ESS/sec' (mean), rounded to 2 decimals.
    """
    result_data = []
    for label, array in data_pairs:
        # hoist the column means: the original recomputed np.mean(axis=0) twice
        col_means = np.mean(array, axis=0)
        result_data.append({
            'label': label,
            'mean': col_means[0],
            'std dev': np.std(array, axis=0)[0],
            'ESS/sec': col_means[1]
        })
    df = pd.DataFrame(result_data)
    return df.set_index('label').round(2)
50+
51+
52+
# Runs data-generating model and returns a Dict containing simulated data and metadata.
def simulate_data(
    N_eth: int, N_edu: int, N_age: int,
    baseline: float, sens: float, spec: float,
    N_obs_per_stratum: int, *, seed: int = 345678
) -> Dict[str, Any]:
    """Run the data-generating Stan model and return simulated data plus metadata.

    :param N_eth: number of ethnicity categories
    :param N_edu: number of education categories
    :param N_age: number of age categories
    :param baseline: generator intercept; also used downstream as the
        intercept prior mean
    :param sens: assumed test sensitivity
    :param spec: assumed test specificity
    :param N_obs_per_stratum: observations per (sex, eth, edu, age) stratum
    :param seed: RNG seed forwarded to CmdStan
    :return: dict of inputs for the binomial models: simulated outcomes,
        the generating parameters, and the CmdStan seed actually used.
    """
    # 2 sexes x eth x edu x age strata, N_obs_per_stratum observations each
    N = 2 * N_eth * N_edu * N_age * N_obs_per_stratum

    inputs_dict = {
        'N': N, 'N_eth': N_eth, 'N_edu': N_edu, 'N_age': N_age,
        'baseline': baseline, 'sens': sens, 'spec': spec
    }
    datagen_mod = CmdStanModel(stan_file=os.path.join('stan', 'gen_binomial_4_preds.stan'))
    # Bug fix: forward the caller's seed; previously the literal 45678 was
    # passed here, silently ignoring the `seed` keyword argument (which also
    # defeated the data_tiny['seed'] chaining done by the caller below).
    sim_data = datagen_mod.sample(data=inputs_dict, iter_warmup=1, iter_sampling=1, chains=1,
                                  show_progress=False, show_console=False, refresh=10_000,
                                  seed=seed)
    # Single draw (iter_sampling=1): [0] selects it; count/index variables are
    # cast to int because Stan CSV output stores them as doubles.
    gen_data = {
        'N': sim_data.pos_tests.shape[1],
        'N_age': N_age,
        'N_eth': N_eth,
        'N_edu': N_edu,
        'sens': sens,
        'spec': spec,
        'intercept_prior_mean': baseline,
        'intercept_prior_scale': 2.5,
        'pos_tests': sim_data.pos_tests[0].astype(int),
        'tests': sim_data.tests[0].astype(int),
        'sex': sim_data.sex[0].astype(int),
        'age': sim_data.age[0].astype(int),
        'eth': sim_data.eth[0].astype(int),
        'edu': sim_data.edu[0].astype(int),
        'beta_0': sim_data.beta_0[0],
        'pct_sex': sim_data.pct_sex[0],
        'beta_sex': sim_data.beta_sex[0],
        'pct_age': sim_data.pct_age[0],
        'beta_age': sim_data.beta_age[0],
        'pct_eth': sim_data.pct_eth[0],
        'beta_eth': sim_data.beta_eth[0],
        'pct_edu': sim_data.pct_edu[0],
        'beta_edu': sim_data.beta_edu[0],
        'seed': sim_data.metadata.cmdstan_config['seed'],
    }
    return gen_data
96+
97+
98+
# Create a dataset - fix sizes, and seed

N_eth = 3         # number of ethnicity categories
N_edu = 5         # number of education categories
N_age = 9         # number of age categories
baseline = -3.5   # generator intercept (log-odds scale)
sens = 0.75       # assumed test sensitivity
spec = 0.9995     # assumed test specificity
# data_tiny seeds the chain: its reported CmdStan seed is reused so that the
# small and large datasets are generated from the same seed.
data_tiny = simulate_data(N_eth, N_edu, N_age, baseline, sens, spec, 7, seed=45678)
data_small = simulate_data(N_eth, N_edu, N_age, baseline, sens, spec, 17, seed=data_tiny['seed'])
data_large = simulate_data(N_eth, N_edu, N_age, baseline, sens, spec, 200, seed=data_tiny['seed'])


# sum to zero vector

binomial_ozs_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_ozs.stan'))
times_ozs_large = time_fits(100, binomial_ozs_mod, data_large)
times_ozs_small = time_fits(100, binomial_ozs_mod, data_small)

# hard sum-to-zero constraint (comment fixed: was a copy-paste of "sum to zero vector")

binomial_hard_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_hard.stan'))
times_hard_small = time_fits(100, binomial_hard_mod, data_small)
times_hard_large = time_fits(100, binomial_hard_mod, data_large)


# soft sum-to-zero constraint

binomial_soft_mod = CmdStanModel(stan_file=os.path.join('stan', 'binomial_4preds_soft.stan'))
times_soft_small = time_fits(100, binomial_soft_mod, data_small)
times_soft_large = time_fits(100, binomial_soft_mod, data_large)

# Summarize all six 100-run timing arrays into one table and write it as JSON.
df_summary = summarize_times([('ozs small', times_ozs_small),
                              ('ozs large', times_ozs_large),
                              ('hard small', times_hard_small),
                              ('hard large', times_hard_large),
                              ('soft small', times_soft_small),
                              ('soft large', times_soft_large)])
df_summary.to_json("binomial_runtimes.json")
138+

jupyter/sum-to-zero/utils.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import os
2+
import os.path
3+
import logging
4+
import numpy as np
5+
import pandas as pd
6+
7+
from typing import Any, Dict
8+
from cmdstanpy import CmdStanModel
9+
10+
cmdstanpy_logger = logging.getLogger("cmdstanpy")
11+
cmdstanpy_logger.setLevel(logging.ERROR)
12+
13+
import warnings
14+
warnings.filterwarnings('ignore')
15+
16+
17+
# Runs data-generating model and returns a Dict containing simulated data and metadata.
def simulate_data(
    N_eth: int, N_edu: int, N_age: int,
    baseline: float, sens: float, spec: float,
    N_obs_per_stratum: int, *, seed: int = 345678
) -> Dict[str, Any]:
    """Run the data-generating Stan model and return simulated data plus metadata.

    :param N_eth: number of ethnicity categories
    :param N_edu: number of education categories
    :param N_age: number of age categories
    :param baseline: generator intercept; also used downstream as the
        intercept prior mean
    :param sens: assumed test sensitivity
    :param spec: assumed test specificity
    :param N_obs_per_stratum: observations per (sex, eth, edu, age) stratum
    :param seed: RNG seed forwarded to CmdStan
    :return: dict of inputs for the binomial models: simulated outcomes,
        the generating parameters, and the CmdStan seed actually used.
    """
    # 2 sexes x eth x edu x age strata, N_obs_per_stratum observations each
    N = 2 * N_eth * N_edu * N_age * N_obs_per_stratum
    inputs_dict = {
        'N': N, 'N_eth': N_eth, 'N_edu': N_edu, 'N_age': N_age,
        'baseline': baseline, 'sens': sens, 'spec': spec
    }
    datagen_mod = CmdStanModel(stan_file=os.path.join('stan', 'gen_binomial_4_preds.stan'))
    # Bug fix: forward the caller's seed; previously the literal 45678 was
    # passed here, silently ignoring the `seed` keyword argument.
    sim_data = datagen_mod.sample(data=inputs_dict, iter_warmup=1, iter_sampling=1, chains=1,
                                  show_progress=False, show_console=False, refresh=10_000,
                                  seed=seed)
    # Single draw (iter_sampling=1): [0] selects it; count/index variables are
    # cast to int because Stan CSV output stores them as doubles.
    gen_data = {
        'N': sim_data.pos_tests.shape[1],
        'N_age': N_age,
        'N_eth': N_eth,
        'N_edu': N_edu,
        'sens': sens,
        'spec': spec,
        'intercept_prior_mean': baseline,
        'intercept_prior_scale': 2.5,
        'pos_tests': sim_data.pos_tests[0].astype(int),
        'tests': sim_data.tests[0].astype(int),
        'sex': sim_data.sex[0].astype(int),
        'age': sim_data.age[0].astype(int),
        'eth': sim_data.eth[0].astype(int),
        'edu': sim_data.edu[0].astype(int),
        'beta_0': sim_data.beta_0[0],
        'pct_sex': sim_data.pct_sex[0],
        'beta_sex': sim_data.beta_sex[0],
        'pct_age': sim_data.pct_age[0],
        'beta_age': sim_data.beta_age[0],
        'pct_eth': sim_data.pct_eth[0],
        'beta_eth': sim_data.beta_eth[0],
        'pct_edu': sim_data.pct_edu[0],
        'beta_edu': sim_data.beta_edu[0],
        'seed': sim_data.metadata.cmdstan_config['seed'],
    }
    return gen_data
59+
60+

0 commit comments

Comments
 (0)