Skip to content

Commit e1b3bde

Browse files
Updates beaker-py version to >2. (#1021)
* Updates beaker-py version. * Fixed secrets ref. * Fixed whoami command * Fixed experimentspec error * Fixed constraints reference. * Update mason.py for beaker-py v2 API changes - Changed beaker_client.workspace.secrets() to beaker_client.secret.list() - Changed beaker_client.account.whoami() to beaker_client.user.get() - Changed beaker.ExperimentSpec to beaker.BeakerExperimentSpec - Changed beaker.Constraints to beaker.BeakerConstraints - Changed beaker.RetrySpec to beaker.BeakerRetrySpec * Fix beaker.TaskSpec to beaker.BeakerTaskSpec for v2 API * Fix remaining beaker v2 API changes - Changed all beaker.EnvVar to beaker.BeakerEnvVar - Changed beaker.DataMount to beaker.BeakerDataMount - Changed beaker.DataSource to beaker.BeakerDataSource - Changed beaker.TaskResources to beaker.BeakerTaskResources - Changed beaker.ImageSource to beaker.BeakerImageSource - Changed beaker.ResultSpec to beaker.BeakerResultSpec - Changed beaker.TaskContext to beaker.BeakerTaskContext - Changed beaker.Priority to beaker.BeakerPriority * Fix BeakerPriority to BeakerJobPriority for v2 API * Fix BeakerJobPriority enum access to use string key * Fix experiment ID access for beaker v2 API The experiment.create() method now returns a BeakerWorkload object, which has an experiment field containing the ID. * Fix beaker v2 API compatibility in utils.py and test_utils.py - Fixed exception names: ConfigurationError → BeakerConfigurationError, ExperimentNotFound → BeakerExperimentNotFound - Updated to use workload.get() and experiment.get_spec() instead of experiment.get() - Changed description update to use workload.update() instead of experiment.set_description() - Updated test mocks to match the new API structure * Updated finetune script * Now, finetune_8b.sh uses uv. * Updated finetune_8b.sh to work with build_image_and_launch.sh. * Updated code * added chat template to script. * changed priority * updated priority * Updated scripts/train/debug/finetune.sh. * added non-resumable flag. * Set description for debug finetune script. * Actually set chat template name. * Updated env var name * updated typo * Updated code * Updated error handling for interactive * Updated script * Refactored finetune.sh
1 parent 5e49954 commit e1b3bde

File tree

10 files changed

+986
-887
lines changed

10 files changed

+986
-887
lines changed

mason.py

Lines changed: 74 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@ def get_args():
159159
)
160160
parser.add_argument(
161161
"--timeout",
162-
type=str,
163-
help="Timeout for the Beaker task (e.g., '2h', '30m', '1d'). If not specified, no timeout is set.",
162+
type=int,
163+
help="Timeout for the Beaker task in seconds (e.g., 7200 for 2 hours). If not specified, no timeout is set.",
164164
default=None,
165165
)
166166
# Split up the mason args from the Python args.
@@ -232,15 +232,15 @@ def get_env_vars(
232232
additional_env_var_names = {var["name"] for var in additional_env_vars}
233233

234234
env_vars = [
235-
beaker.EnvVar(name=name, value=value)
235+
beaker.BeakerEnvVar(name=name, value=value)
236236
for name, value in DEFAULT_ENV_VARS.items()
237237
if name not in additional_env_var_names
238238
]
239239

240-
env_vars.extend([beaker.EnvVar(name=env_var["name"], value=env_var["value"]) for env_var in additional_env_vars])
240+
env_vars.extend([beaker.BeakerEnvVar(name=env_var["name"], value=env_var["value"]) for env_var in additional_env_vars])
241241

242242
# add user-specific secrets
243-
env_vars.extend([beaker.EnvVar(name=secret["name"], secret=secret["value"]) for secret in additional_secrets])
243+
env_vars.extend([beaker.BeakerEnvVar(name=secret["name"], secret=secret["value"]) for secret in additional_secrets])
244244

245245
useful_secrets = [
246246
"HF_TOKEN",
@@ -254,22 +254,24 @@ def get_env_vars(
254254
]
255255
for useful_secret in useful_secrets:
256256
if f"{whoami}_{useful_secret}" in beaker_secrets:
257-
env_vars.append(beaker.EnvVar(name=useful_secret, secret=f"{whoami}_{useful_secret}"))
257+
env_vars.append(beaker.BeakerEnvVar(name=useful_secret, secret=f"{whoami}_{useful_secret}"))
258258
elif useful_secret in beaker_secrets:
259-
env_vars.append(beaker.EnvVar(name=useful_secret, secret=useful_secret))
259+
env_vars.append(beaker.BeakerEnvVar(name=useful_secret, secret=useful_secret))
260260

261261
# use the user's PATH; including the conda / python PATH
262262
if not pure_docker_mode:
263-
env_vars.extend([beaker.EnvVar(name="PATH", value=os.getenv("PATH"))])
263+
env_vars.extend([beaker.BeakerEnvVar(name="PATH", value=os.getenv("PATH"))])
264264

265265
# if all cluster is in weka, we mount the weka
266266
if all(c in WEKA_CLUSTERS for c in cluster):
267267
env_vars.extend(
268268
[
269-
beaker.EnvVar(name="HF_HOME", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"),
270-
beaker.EnvVar(name="HF_DATASETS_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"),
271-
beaker.EnvVar(name="HF_HUB_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/hub"),
272-
beaker.EnvVar(
269+
beaker.BeakerEnvVar(name="HF_HOME", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"),
270+
beaker.BeakerEnvVar(
271+
name="HF_DATASETS_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"
272+
),
273+
beaker.BeakerEnvVar(name="HF_HUB_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/hub"),
274+
beaker.BeakerEnvVar(
273275
name="CHECKPOINT_OUTPUT_DIR",
274276
value=f"/weka/oe-adapt-default/allennlp/deletable_checkpoint_states/{global_wandb_id}",
275277
),
@@ -278,19 +280,19 @@ def get_env_vars(
278280
if num_nodes > 1:
279281
env_vars.extend(
280282
[
281-
beaker.EnvVar(name="NCCL_SOCKET_IFNAME", value="ib"),
282-
beaker.EnvVar(name="NCCL_IB_HCA", value="^=mlx5_bond_0"),
283+
beaker.BeakerEnvVar(name="NCCL_SOCKET_IFNAME", value="ib"),
284+
beaker.BeakerEnvVar(name="NCCL_IB_HCA", value="^=mlx5_bond_0"),
283285
]
284286
)
285287
# if all cluster is in gcp we add the following env
286288

287289
elif all(c in GCP_CLUSTERS for c in cluster):
288290
env_vars.extend(
289291
[
290-
beaker.EnvVar(name="HF_HOME", value="/filestore/.cache/huggingface"),
291-
beaker.EnvVar(name="HF_DATASETS_CACHE", value="/filestore/.cache/huggingface"),
292-
beaker.EnvVar(name="HF_HUB_CACHE", value="/filestore/.cache/hub"),
293-
beaker.EnvVar(
292+
beaker.BeakerEnvVar(name="HF_HOME", value="/filestore/.cache/huggingface"),
293+
beaker.BeakerEnvVar(name="HF_DATASETS_CACHE", value="/filestore/.cache/huggingface"),
294+
beaker.BeakerEnvVar(name="HF_HUB_CACHE", value="/filestore/.cache/hub"),
295+
beaker.BeakerEnvVar(
294296
name="HF_HUB_ENABLE_HF_TRANSFER",
295297
value="0", # we disable it because GCP is weird on uploading to the hub
296298
),
@@ -299,40 +301,40 @@ def get_env_vars(
299301
if num_nodes > 1:
300302
env_vars.extend(
301303
[
302-
beaker.EnvVar(name="LD_LIBRARY_PATH", value=r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}"),
303-
beaker.EnvVar(name="NCCL_CROSS_NIC", value="0"),
304-
beaker.EnvVar(name="NCCL_ALGO", value="Ring,Tree"),
305-
beaker.EnvVar(name="NCCL_PROTO", value="Simple"),
306-
beaker.EnvVar(name="NCCL_MIN_NCHANNELS", value="4"),
307-
beaker.EnvVar(name="NCCL_P2P_NET_CHUNKSIZE", value="524288"),
308-
beaker.EnvVar(name="NCCL_P2P_PCI_CHUNKSIZE", value="524288"),
309-
beaker.EnvVar(name="NCCL_P2P_NVL_CHUNKSIZE", value="1048576"),
310-
beaker.EnvVar(name="NCCL_FASTRAK_NUM_FLOWS", value="2"),
311-
beaker.EnvVar(name="NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL", value="0"),
312-
beaker.EnvVar(name="NCCL_BUFFSIZE", value="8388608"),
313-
beaker.EnvVar(name="NCCL_FASTRAK_USE_SNAP", value="1"),
314-
beaker.EnvVar(name="CUDA_VISIBLE_DEVICES", value="0,1,2,3,4,5,6,7"),
315-
beaker.EnvVar(name="NCCL_NET_GDR_LEVEL", value="PIX"),
316-
beaker.EnvVar(name="NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING", value="0"),
317-
beaker.EnvVar(name="NCCL_TUNER_PLUGIN", value="libnccl-tuner.so"),
318-
beaker.EnvVar(
304+
beaker.BeakerEnvVar(name="LD_LIBRARY_PATH", value=r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}"),
305+
beaker.BeakerEnvVar(name="NCCL_CROSS_NIC", value="0"),
306+
beaker.BeakerEnvVar(name="NCCL_ALGO", value="Ring,Tree"),
307+
beaker.BeakerEnvVar(name="NCCL_PROTO", value="Simple"),
308+
beaker.BeakerEnvVar(name="NCCL_MIN_NCHANNELS", value="4"),
309+
beaker.BeakerEnvVar(name="NCCL_P2P_NET_CHUNKSIZE", value="524288"),
310+
beaker.BeakerEnvVar(name="NCCL_P2P_PCI_CHUNKSIZE", value="524288"),
311+
beaker.BeakerEnvVar(name="NCCL_P2P_NVL_CHUNKSIZE", value="1048576"),
312+
beaker.BeakerEnvVar(name="NCCL_FASTRAK_NUM_FLOWS", value="2"),
313+
beaker.BeakerEnvVar(name="NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL", value="0"),
314+
beaker.BeakerEnvVar(name="NCCL_BUFFSIZE", value="8388608"),
315+
beaker.BeakerEnvVar(name="NCCL_FASTRAK_USE_SNAP", value="1"),
316+
beaker.BeakerEnvVar(name="CUDA_VISIBLE_DEVICES", value="0,1,2,3,4,5,6,7"),
317+
beaker.BeakerEnvVar(name="NCCL_NET_GDR_LEVEL", value="PIX"),
318+
beaker.BeakerEnvVar(name="NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING", value="0"),
319+
beaker.BeakerEnvVar(name="NCCL_TUNER_PLUGIN", value="libnccl-tuner.so"),
320+
beaker.BeakerEnvVar(
319321
name="NCCL_TUNER_CONFIG_PATH", value="/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto"
320322
),
321-
beaker.EnvVar(
323+
beaker.BeakerEnvVar(
322324
name="NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE",
323325
value="/var/lib/tcpxo/lib64/a3plus_guest_config.textproto",
324326
),
325-
beaker.EnvVar(name="NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", value="600000"),
326-
beaker.EnvVar(name="NCCL_NVLS_ENABLE", value="0"),
327-
beaker.EnvVar(name="NCCL_FASTRAK_CTRL_DEV", value="enp0s12"),
328-
beaker.EnvVar(
327+
beaker.BeakerEnvVar(name="NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", value="600000"),
328+
beaker.BeakerEnvVar(name="NCCL_NVLS_ENABLE", value="0"),
329+
beaker.BeakerEnvVar(name="NCCL_FASTRAK_CTRL_DEV", value="enp0s12"),
330+
beaker.BeakerEnvVar(
329331
name="NCCL_FASTRAK_IFNAME",
330332
value="enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0",
331333
),
332-
beaker.EnvVar(name="NCCL_SOCKET_IFNAME", value="enp0s12"),
333-
beaker.EnvVar(name="NCCL_USE_SNAP", value="1"),
334-
beaker.EnvVar(name="NCCL_FASTRAK_USE_LLCM", value="1"),
335-
beaker.EnvVar(name="NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY", value="/dev/aperture_devices"),
334+
beaker.BeakerEnvVar(name="NCCL_SOCKET_IFNAME", value="enp0s12"),
335+
beaker.BeakerEnvVar(name="NCCL_USE_SNAP", value="1"),
336+
beaker.BeakerEnvVar(name="NCCL_FASTRAK_USE_LLCM", value="1"),
337+
beaker.BeakerEnvVar(name="NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY", value="/dev/aperture_devices"),
336338
]
337339
)
338340
# don't mount anything; assume no cache
@@ -342,8 +344,8 @@ def get_env_vars(
342344
if resumable:
343345
env_vars.extend(
344346
[
345-
beaker.EnvVar(name="WANDB_RUN_ID", value=global_wandb_id),
346-
beaker.EnvVar(name="WANDB_RESUME", value="allow"),
347+
beaker.BeakerEnvVar(name="WANDB_RUN_ID", value=global_wandb_id),
348+
beaker.BeakerEnvVar(name="WANDB_RESUME", value="allow"),
347349
]
348350
)
349351

@@ -356,16 +358,22 @@ def get_datasets(beaker_datasets, cluster: List[str]):
356358
# if all cluster is in weka, we mount the weka
357359
if all(c in WEKA_CLUSTERS for c in cluster):
358360
res = [
359-
beaker.DataMount(source=beaker.DataSource(weka="oe-adapt-default"), mount_path="/weka/oe-adapt-default"),
360-
beaker.DataMount(
361-
source=beaker.DataSource(weka="oe-training-default"), mount_path="/weka/oe-training-default"
361+
beaker.BeakerDataMount(
362+
source=beaker.BeakerDataSource(weka="oe-adapt-default"), mount_path="/weka/oe-adapt-default"
363+
),
364+
beaker.BeakerDataMount(
365+
source=beaker.BeakerDataSource(weka="oe-training-default"), mount_path="/weka/oe-training-default"
362366
),
363367
]
364368
elif all(c in GCP_CLUSTERS for c in cluster):
365-
res = [beaker.DataMount(source=beaker.DataSource(host_path="/mnt/filestore_1"), mount_path="/filestore")]
369+
res = [
370+
beaker.BeakerDataMount(
371+
source=beaker.BeakerDataSource(host_path="/mnt/filestore_1"), mount_path="/filestore"
372+
)
373+
]
366374
for beaker_dataset in beaker_datasets:
367-
to_append = beaker.DataMount(
368-
source=beaker.DataSource(beaker=beaker_dataset["beaker"]), mount_path=beaker_dataset["mount_path"]
375+
to_append = beaker.BeakerDataMount(
376+
source=beaker.BeakerDataSource(beaker=beaker_dataset["beaker"]), mount_path=beaker_dataset["mount_path"]
369377
)
370378
res.append(to_append)
371379

@@ -716,17 +724,19 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami:
716724
raise ValueError("GCP clusters do not have the dev filesystem, please use a proper image")
717725

718726
if args.hostname is not None:
719-
constraints = beaker.Constraints(hostname=args.hostname)
727+
constraints = beaker.BeakerConstraints(hostname=args.hostname)
720728
else:
721-
constraints = beaker.Constraints(cluster=args.cluster)
722-
spec = beaker.TaskSpec(
729+
constraints = beaker.BeakerConstraints(cluster=args.cluster)
730+
spec = beaker.BeakerTaskSpec(
723731
name=f"{args.task_name}__{i}",
724-
image=beaker.ImageSource(beaker=args.image),
732+
image=beaker.BeakerImageSource(beaker=args.image),
725733
command=["/bin/bash", "-c"],
726734
arguments=[full_command],
727-
result=beaker.ResultSpec(path="/output"),
735+
result=beaker.BeakerResultSpec(path="/output"),
728736
datasets=get_datasets(args.beaker_datasets, args.cluster),
729-
context=beaker.TaskContext(priority=beaker.Priority(args.priority), preemptible=args.preemptible),
737+
context=beaker.BeakerTaskContext(
738+
priority=beaker.BeakerJobPriority[args.priority], preemptible=args.preemptible
739+
),
730740
constraints=constraints,
731741
env_vars=get_env_vars(
732742
args.pure_docker_mode,
@@ -738,7 +748,7 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami:
738748
args.env,
739749
args.secret,
740750
),
741-
resources=beaker.TaskResources(gpu_count=args.gpus, shared_memory=args.shared_memory),
751+
resources=beaker.BeakerTaskResources(gpu_count=args.gpus, shared_memory=args.shared_memory),
742752
replicas=args.num_nodes,
743753
)
744754
if args.num_nodes > 1:
@@ -769,8 +779,8 @@ def main():
769779
beaker_client = beaker.Beaker.from_env(default_workspace=args.workspace)
770780
else:
771781
beaker_client = beaker.Beaker.from_env()
772-
beaker_secrets = [secret.name for secret in beaker_client.workspace.secrets()]
773-
whoami = beaker_client.account.whoami().name
782+
beaker_secrets = [secret.name for secret in beaker_client.secret.list()]
783+
whoami = beaker_client.user.get().name
774784

775785
full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands]
776786
if is_external_user:
@@ -789,17 +799,17 @@ def main():
789799
console.print(Text(full_command))
790800
if is_external_user:
791801
return
792-
experiment_spec = beaker.ExperimentSpec(
802+
experiment_spec = beaker.BeakerExperimentSpec(
793803
description=args.description,
794804
tasks=[
795805
make_task_spec(args, full_command, i, beaker_secrets, whoami, args.resumable)
796806
for i, full_command in enumerate(full_commands)
797807
],
798808
budget=args.budget,
799-
retry=beaker.RetrySpec(allowed_task_retries=args.max_retries),
809+
retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries),
800810
)
801811
exp = beaker_client.experiment.create(spec=experiment_spec)
802-
console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.id}")
812+
console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}")
803813

804814

805815
if __name__ == "__main__":

open_instruct/test_utils.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,16 +85,22 @@ def _setup_beaker_mocks(mock_beaker_from_env, mock_is_beaker_job, initial_descri
8585
mock_client = mock.MagicMock()
8686
mock_beaker_from_env.return_value = mock_client
8787

88+
# Mock the workload object
89+
mock_workload = mock.MagicMock()
90+
mock_client.workload.get.return_value = mock_workload
91+
92+
# Mock the spec object returned by experiment.get_spec
8893
mock_spec = mock.MagicMock()
8994
mock_spec.description = initial_description
90-
mock_client.experiment.get.return_value = mock_spec
95+
mock_client.experiment.get_spec.return_value = mock_spec
9196

9297
description_history = []
9398

94-
def track_description(exp_id, desc):
95-
description_history.append(desc)
99+
def track_description(workload, description=None):
100+
if description is not None:
101+
description_history.append(description)
96102

97-
mock_client.experiment.set_description.side_effect = track_description
103+
mock_client.workload.update.side_effect = track_description
98104

99105
return mock_client, mock_spec, description_history
100106

open_instruct/utils.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -978,13 +978,16 @@ def maybe_update_beaker_description(
978978

979979
try:
980980
client = beaker.Beaker.from_env()
981-
except beaker.exceptions.ConfigurationError as e:
981+
except beaker.exceptions.BeakerConfigurationError as e:
982982
logger.warning(f"Failed to initialize Beaker client: {e}")
983983
return
984984

985985
try:
986-
spec = client.experiment.get(experiment_id)
987-
except beaker.exceptions.ExperimentNotFound:
986+
# Get the workload first (experiment_id is actually BEAKER_WORKLOAD_ID)
987+
workload = client.workload.get(experiment_id)
988+
# Then get the experiment spec from the workload
989+
spec = client.experiment.get_spec(workload)
990+
except (beaker.exceptions.BeakerExperimentNotFound, ValueError):
988991
logger.warning(
989992
f"Failed to get Beaker experiment with ID: {experiment_id}"
990993
"This might be fine if you are e.g. running in an interactive job."
@@ -1025,7 +1028,8 @@ def maybe_update_beaker_description(
10251028
description_components.append(progress_bar)
10261029
new_description = " ".join(description_components)
10271030
try:
1028-
client.experiment.set_description(experiment_id, new_description)
1031+
# Update the workload description using the workload object we got earlier
1032+
client.workload.update(workload, description=new_description)
10291033
except requests.exceptions.HTTPError as e:
10301034
logger.warning(
10311035
f"Failed to update Beaker description due to HTTP error: {e}"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ link-mode = "hardlink"
7676

7777
[dependency-groups]
7878
dev = [
79-
"beaker-py>=1.32.2,<2.0",
79+
"beaker-py>=2.5.0",
8080
"mkdocs-material>=9.6.8",
8181
"markdown-include>=0.8.1",
8282
"pytest>=8.3.4",

scripts/train/debug/finetune.sh

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
uv run accelerate launch \
1+
#!/bin/bash
2+
3+
LAUNCH_CMD="accelerate launch \
24
--mixed_precision bf16 \
35
--num_processes 1 \
46
open_instruct/finetune.py \
@@ -12,11 +14,34 @@ uv run accelerate launch \
1214
--warmup_ratio 0.03 \
1315
--weight_decay 0.0 \
1416
--num_train_epochs 2 \
15-
--output_dir output/ \
1617
--report_to wandb \
1718
--logging_steps 1 \
1819
--model_revision main \
1920
--dataset_mixer_list allenai/tulu-3-sft-personas-algebra 100 \
2021
--add_bos \
2122
--seed 123 \
22-
# --with_tracking \
23+
--chat_template_name tulu \
24+
--with_tracking"
25+
26+
if [ -n "$1" ]; then
27+
BEAKER_IMAGE="$1"
28+
echo "Using Beaker image: $BEAKER_IMAGE"
29+
30+
uv run python mason.py \
31+
--cluster ai2/jupiter \
32+
--workspace ai2/open-instruct-dev \
33+
--priority normal \
34+
--image "$BEAKER_IMAGE" \
35+
--description "Single GPU finetune job." \
36+
--pure_docker_mode \
37+
--preemptible \
38+
--num_nodes 1 \
39+
--budget ai2/oe-adapt \
40+
--gpus 1 \
41+
--non_resumable \
42+
-- \
43+
$LAUNCH_CMD
44+
else
45+
echo "Running locally..."
46+
uv run $LAUNCH_CMD
47+
fi

scripts/train/debug/large_test_script.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ uv run python mason.py \
1313
--preemptible \
1414
--num_nodes 2 \
1515
--description "Large (multi-node) test script." \
16-
--timeout "1h" \
16+
--timeout 3600 \
1717
--max_retries 0 \
1818
--env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
1919
--budget ai2/oe-adapt \

scripts/train/debug/single_gpu_on_beaker.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ uv run python mason.py \
1717
--priority urgent \
1818
--num_nodes 1 \
1919
--max_retries 0 \
20-
--timeout "15m" \
20+
--timeout 900 \
2121
--env VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
2222
--budget ai2/oe-adapt \
2323
--gpus 1 \

0 commit comments

Comments
 (0)