@@ -159,8 +159,8 @@ def get_args():
159159 )
160160 parser .add_argument (
161161 "--timeout" ,
162- type = str ,
163- help = "Timeout for the Beaker task (e.g., '2h', '30m', '1d' ). If not specified, no timeout is set." ,
162+ type = int ,
163+ help = "Timeout for the Beaker task in seconds (e.g., 7200 for 2 hours ). If not specified, no timeout is set." ,
164164 default = None ,
165165 )
166166 # Split up the mason args from the Python args.
@@ -232,15 +232,15 @@ def get_env_vars(
232232 additional_env_var_names = {var ["name" ] for var in additional_env_vars }
233233
234234 env_vars = [
235- beaker .EnvVar (name = name , value = value )
235+ beaker .BeakerEnvVar (name = name , value = value )
236236 for name , value in DEFAULT_ENV_VARS .items ()
237237 if name not in additional_env_var_names
238238 ]
239239
240- env_vars .extend ([beaker .EnvVar (name = env_var ["name" ], value = env_var ["value" ]) for env_var in additional_env_vars ])
240+ env_vars .extend ([beaker .BeakerEnvVar (name = env_var ["name" ], value = env_var ["value" ]) for env_var in additional_env_vars ])
241241
242242 # add user-specific secrets
243- env_vars .extend ([beaker .EnvVar (name = secret ["name" ], secret = secret ["value" ]) for secret in additional_secrets ])
243+ env_vars .extend ([beaker .BeakerEnvVar (name = secret ["name" ], secret = secret ["value" ]) for secret in additional_secrets ])
244244
245245 useful_secrets = [
246246 "HF_TOKEN" ,
@@ -254,22 +254,24 @@ def get_env_vars(
254254 ]
255255 for useful_secret in useful_secrets :
256256 if f"{ whoami } _{ useful_secret } " in beaker_secrets :
257- env_vars .append (beaker .EnvVar (name = useful_secret , secret = f"{ whoami } _{ useful_secret } " ))
257+ env_vars .append (beaker .BeakerEnvVar (name = useful_secret , secret = f"{ whoami } _{ useful_secret } " ))
258258 elif useful_secret in beaker_secrets :
259- env_vars .append (beaker .EnvVar (name = useful_secret , secret = useful_secret ))
259+ env_vars .append (beaker .BeakerEnvVar (name = useful_secret , secret = useful_secret ))
260260
261261 # use the user's PATH; including the conda / python PATH
262262 if not pure_docker_mode :
263- env_vars .extend ([beaker .EnvVar (name = "PATH" , value = os .getenv ("PATH" ))])
263+ env_vars .extend ([beaker .BeakerEnvVar (name = "PATH" , value = os .getenv ("PATH" ))])
264264
265265 # if all cluster is in weka, we mount the weka
266266 if all (c in WEKA_CLUSTERS for c in cluster ):
267267 env_vars .extend (
268268 [
269- beaker .EnvVar (name = "HF_HOME" , value = "/weka/oe-adapt-default/allennlp/.cache/huggingface" ),
270- beaker .EnvVar (name = "HF_DATASETS_CACHE" , value = "/weka/oe-adapt-default/allennlp/.cache/huggingface" ),
271- beaker .EnvVar (name = "HF_HUB_CACHE" , value = "/weka/oe-adapt-default/allennlp/.cache/hub" ),
272- beaker .EnvVar (
269+ beaker .BeakerEnvVar (name = "HF_HOME" , value = "/weka/oe-adapt-default/allennlp/.cache/huggingface" ),
270+ beaker .BeakerEnvVar (
271+ name = "HF_DATASETS_CACHE" , value = "/weka/oe-adapt-default/allennlp/.cache/huggingface"
272+ ),
273+ beaker .BeakerEnvVar (name = "HF_HUB_CACHE" , value = "/weka/oe-adapt-default/allennlp/.cache/hub" ),
274+ beaker .BeakerEnvVar (
273275 name = "CHECKPOINT_OUTPUT_DIR" ,
274276 value = f"/weka/oe-adapt-default/allennlp/deletable_checkpoint_states/{ global_wandb_id } " ,
275277 ),
@@ -278,19 +280,19 @@ def get_env_vars(
278280 if num_nodes > 1 :
279281 env_vars .extend (
280282 [
281- beaker .EnvVar (name = "NCCL_SOCKET_IFNAME" , value = "ib" ),
282- beaker .EnvVar (name = "NCCL_IB_HCA" , value = "^=mlx5_bond_0" ),
283+ beaker .BeakerEnvVar (name = "NCCL_SOCKET_IFNAME" , value = "ib" ),
284+ beaker .BeakerEnvVar (name = "NCCL_IB_HCA" , value = "^=mlx5_bond_0" ),
283285 ]
284286 )
285287 # if all cluster is in gcp we add the following env
286288
287289 elif all (c in GCP_CLUSTERS for c in cluster ):
288290 env_vars .extend (
289291 [
290- beaker .EnvVar (name = "HF_HOME" , value = "/filestore/.cache/huggingface" ),
291- beaker .EnvVar (name = "HF_DATASETS_CACHE" , value = "/filestore/.cache/huggingface" ),
292- beaker .EnvVar (name = "HF_HUB_CACHE" , value = "/filestore/.cache/hub" ),
293- beaker .EnvVar (
292+ beaker .BeakerEnvVar (name = "HF_HOME" , value = "/filestore/.cache/huggingface" ),
293+ beaker .BeakerEnvVar (name = "HF_DATASETS_CACHE" , value = "/filestore/.cache/huggingface" ),
294+ beaker .BeakerEnvVar (name = "HF_HUB_CACHE" , value = "/filestore/.cache/hub" ),
295+ beaker .BeakerEnvVar (
294296 name = "HF_HUB_ENABLE_HF_TRANSFER" ,
295297 value = "0" , # we disable it because GCP is weird on uploading to the hub
296298 ),
@@ -299,40 +301,40 @@ def get_env_vars(
299301 if num_nodes > 1 :
300302 env_vars .extend (
301303 [
302- beaker .EnvVar (name = "LD_LIBRARY_PATH" , value = r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}" ),
303- beaker .EnvVar (name = "NCCL_CROSS_NIC" , value = "0" ),
304- beaker .EnvVar (name = "NCCL_ALGO" , value = "Ring,Tree" ),
305- beaker .EnvVar (name = "NCCL_PROTO" , value = "Simple" ),
306- beaker .EnvVar (name = "NCCL_MIN_NCHANNELS" , value = "4" ),
307- beaker .EnvVar (name = "NCCL_P2P_NET_CHUNKSIZE" , value = "524288" ),
308- beaker .EnvVar (name = "NCCL_P2P_PCI_CHUNKSIZE" , value = "524288" ),
309- beaker .EnvVar (name = "NCCL_P2P_NVL_CHUNKSIZE" , value = "1048576" ),
310- beaker .EnvVar (name = "NCCL_FASTRAK_NUM_FLOWS" , value = "2" ),
311- beaker .EnvVar (name = "NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL" , value = "0" ),
312- beaker .EnvVar (name = "NCCL_BUFFSIZE" , value = "8388608" ),
313- beaker .EnvVar (name = "NCCL_FASTRAK_USE_SNAP" , value = "1" ),
314- beaker .EnvVar (name = "CUDA_VISIBLE_DEVICES" , value = "0,1,2,3,4,5,6,7" ),
315- beaker .EnvVar (name = "NCCL_NET_GDR_LEVEL" , value = "PIX" ),
316- beaker .EnvVar (name = "NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING" , value = "0" ),
317- beaker .EnvVar (name = "NCCL_TUNER_PLUGIN" , value = "libnccl-tuner.so" ),
318- beaker .EnvVar (
304+ beaker .BeakerEnvVar (name = "LD_LIBRARY_PATH" , value = r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}" ),
305+ beaker .BeakerEnvVar (name = "NCCL_CROSS_NIC" , value = "0" ),
306+ beaker .BeakerEnvVar (name = "NCCL_ALGO" , value = "Ring,Tree" ),
307+ beaker .BeakerEnvVar (name = "NCCL_PROTO" , value = "Simple" ),
308+ beaker .BeakerEnvVar (name = "NCCL_MIN_NCHANNELS" , value = "4" ),
309+ beaker .BeakerEnvVar (name = "NCCL_P2P_NET_CHUNKSIZE" , value = "524288" ),
310+ beaker .BeakerEnvVar (name = "NCCL_P2P_PCI_CHUNKSIZE" , value = "524288" ),
311+ beaker .BeakerEnvVar (name = "NCCL_P2P_NVL_CHUNKSIZE" , value = "1048576" ),
312+ beaker .BeakerEnvVar (name = "NCCL_FASTRAK_NUM_FLOWS" , value = "2" ),
313+ beaker .BeakerEnvVar (name = "NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL" , value = "0" ),
314+ beaker .BeakerEnvVar (name = "NCCL_BUFFSIZE" , value = "8388608" ),
315+ beaker .BeakerEnvVar (name = "NCCL_FASTRAK_USE_SNAP" , value = "1" ),
316+ beaker .BeakerEnvVar (name = "CUDA_VISIBLE_DEVICES" , value = "0,1,2,3,4,5,6,7" ),
317+ beaker .BeakerEnvVar (name = "NCCL_NET_GDR_LEVEL" , value = "PIX" ),
318+ beaker .BeakerEnvVar (name = "NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING" , value = "0" ),
319+ beaker .BeakerEnvVar (name = "NCCL_TUNER_PLUGIN" , value = "libnccl-tuner.so" ),
320+ beaker .BeakerEnvVar (
319321 name = "NCCL_TUNER_CONFIG_PATH" , value = "/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto"
320322 ),
321- beaker .EnvVar (
323+ beaker .BeakerEnvVar (
322324 name = "NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE" ,
323325 value = "/var/lib/tcpxo/lib64/a3plus_guest_config.textproto" ,
324326 ),
325- beaker .EnvVar (name = "NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS" , value = "600000" ),
326- beaker .EnvVar (name = "NCCL_NVLS_ENABLE" , value = "0" ),
327- beaker .EnvVar (name = "NCCL_FASTRAK_CTRL_DEV" , value = "enp0s12" ),
328- beaker .EnvVar (
327+ beaker .BeakerEnvVar (name = "NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS" , value = "600000" ),
328+ beaker .BeakerEnvVar (name = "NCCL_NVLS_ENABLE" , value = "0" ),
329+ beaker .BeakerEnvVar (name = "NCCL_FASTRAK_CTRL_DEV" , value = "enp0s12" ),
330+ beaker .BeakerEnvVar (
329331 name = "NCCL_FASTRAK_IFNAME" ,
330332 value = "enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0" ,
331333 ),
332- beaker .EnvVar (name = "NCCL_SOCKET_IFNAME" , value = "enp0s12" ),
333- beaker .EnvVar (name = "NCCL_USE_SNAP" , value = "1" ),
334- beaker .EnvVar (name = "NCCL_FASTRAK_USE_LLCM" , value = "1" ),
335- beaker .EnvVar (name = "NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY" , value = "/dev/aperture_devices" ),
334+ beaker .BeakerEnvVar (name = "NCCL_SOCKET_IFNAME" , value = "enp0s12" ),
335+ beaker .BeakerEnvVar (name = "NCCL_USE_SNAP" , value = "1" ),
336+ beaker .BeakerEnvVar (name = "NCCL_FASTRAK_USE_LLCM" , value = "1" ),
337+ beaker .BeakerEnvVar (name = "NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY" , value = "/dev/aperture_devices" ),
336338 ]
337339 )
338340 # don't mount anything; assume no cache
@@ -342,8 +344,8 @@ def get_env_vars(
342344 if resumable :
343345 env_vars .extend (
344346 [
345- beaker .EnvVar (name = "WANDB_RUN_ID" , value = global_wandb_id ),
346- beaker .EnvVar (name = "WANDB_RESUME" , value = "allow" ),
347+ beaker .BeakerEnvVar (name = "WANDB_RUN_ID" , value = global_wandb_id ),
348+ beaker .BeakerEnvVar (name = "WANDB_RESUME" , value = "allow" ),
347349 ]
348350 )
349351
@@ -356,16 +358,22 @@ def get_datasets(beaker_datasets, cluster: List[str]):
356358 # if all cluster is in weka, we mount the weka
357359 if all (c in WEKA_CLUSTERS for c in cluster ):
358360 res = [
359- beaker .DataMount (source = beaker .DataSource (weka = "oe-adapt-default" ), mount_path = "/weka/oe-adapt-default" ),
360- beaker .DataMount (
361- source = beaker .DataSource (weka = "oe-training-default" ), mount_path = "/weka/oe-training-default"
361+ beaker .BeakerDataMount (
362+ source = beaker .BeakerDataSource (weka = "oe-adapt-default" ), mount_path = "/weka/oe-adapt-default"
363+ ),
364+ beaker .BeakerDataMount (
365+ source = beaker .BeakerDataSource (weka = "oe-training-default" ), mount_path = "/weka/oe-training-default"
362366 ),
363367 ]
364368 elif all (c in GCP_CLUSTERS for c in cluster ):
365- res = [beaker .DataMount (source = beaker .DataSource (host_path = "/mnt/filestore_1" ), mount_path = "/filestore" )]
369+ res = [
370+ beaker .BeakerDataMount (
371+ source = beaker .BeakerDataSource (host_path = "/mnt/filestore_1" ), mount_path = "/filestore"
372+ )
373+ ]
366374 for beaker_dataset in beaker_datasets :
367- to_append = beaker .DataMount (
368- source = beaker .DataSource (beaker = beaker_dataset ["beaker" ]), mount_path = beaker_dataset ["mount_path" ]
375+ to_append = beaker .BeakerDataMount (
376+ source = beaker .BeakerDataSource (beaker = beaker_dataset ["beaker" ]), mount_path = beaker_dataset ["mount_path" ]
369377 )
370378 res .append (to_append )
371379
@@ -716,17 +724,19 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami:
716724 raise ValueError ("GCP clusters do not have the dev filesystem, please use a proper image" )
717725
718726 if args .hostname is not None :
719- constraints = beaker .Constraints (hostname = args .hostname )
727+ constraints = beaker .BeakerConstraints (hostname = args .hostname )
720728 else :
721- constraints = beaker .Constraints (cluster = args .cluster )
722- spec = beaker .TaskSpec (
729+ constraints = beaker .BeakerConstraints (cluster = args .cluster )
730+ spec = beaker .BeakerTaskSpec (
723731 name = f"{ args .task_name } __{ i } " ,
724- image = beaker .ImageSource (beaker = args .image ),
732+ image = beaker .BeakerImageSource (beaker = args .image ),
725733 command = ["/bin/bash" , "-c" ],
726734 arguments = [full_command ],
727- result = beaker .ResultSpec (path = "/output" ),
735+ result = beaker .BeakerResultSpec (path = "/output" ),
728736 datasets = get_datasets (args .beaker_datasets , args .cluster ),
729- context = beaker .TaskContext (priority = beaker .Priority (args .priority ), preemptible = args .preemptible ),
737+ context = beaker .BeakerTaskContext (
738+ priority = beaker .BeakerJobPriority [args .priority ], preemptible = args .preemptible
739+ ),
730740 constraints = constraints ,
731741 env_vars = get_env_vars (
732742 args .pure_docker_mode ,
@@ -738,7 +748,7 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami:
738748 args .env ,
739749 args .secret ,
740750 ),
741- resources = beaker .TaskResources (gpu_count = args .gpus , shared_memory = args .shared_memory ),
751+ resources = beaker .BeakerTaskResources (gpu_count = args .gpus , shared_memory = args .shared_memory ),
742752 replicas = args .num_nodes ,
743753 )
744754 if args .num_nodes > 1 :
@@ -769,8 +779,8 @@ def main():
769779 beaker_client = beaker .Beaker .from_env (default_workspace = args .workspace )
770780 else :
771781 beaker_client = beaker .Beaker .from_env ()
772- beaker_secrets = [secret .name for secret in beaker_client .workspace . secrets ()]
773- whoami = beaker_client .account . whoami ().name
782+ beaker_secrets = [secret .name for secret in beaker_client .secret . list ()]
783+ whoami = beaker_client .user . get ().name
774784
775785 full_commands = [make_internal_command (command , args , whoami , is_external_user ) for command in commands ]
776786 if is_external_user :
@@ -789,17 +799,17 @@ def main():
789799 console .print (Text (full_command ))
790800 if is_external_user :
791801 return
792- experiment_spec = beaker .ExperimentSpec (
802+ experiment_spec = beaker .BeakerExperimentSpec (
793803 description = args .description ,
794804 tasks = [
795805 make_task_spec (args , full_command , i , beaker_secrets , whoami , args .resumable )
796806 for i , full_command in enumerate (full_commands )
797807 ],
798808 budget = args .budget ,
799- retry = beaker .RetrySpec (allowed_task_retries = args .max_retries ),
809+ retry = beaker .BeakerRetrySpec (allowed_task_retries = args .max_retries ),
800810 )
801811 exp = beaker_client .experiment .create (spec = experiment_spec )
802- console .log (f"Kicked off Beaker job. https://beaker.org/ex/{ exp .id } " )
812+ console .log (f"Kicked off Beaker job. https://beaker.org/ex/{ exp .experiment . id } " )
803813
804814
805815if __name__ == "__main__" :
0 commit comments