-
Notifications
You must be signed in to change notification settings - Fork 459
Updates beaker-py version to >2. #1021
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
33 commits
Select commit
Hold shift + click to select a range
cb94af5
Updates beaker-py version.
finbarrtimbers 5c8795d
Fixed secrets ref.
finbarrtimbers 169fe8a
Fixed whoami command
finbarrtimbers 662661b
Fixed experimentspec error
finbarrtimbers ea655b9
Fixed constraints reference.
finbarrtimbers 1c686af
Update mason.py for beaker-py v2 API changes
finbarrtimbers 26c8c2a
Fix beaker.TaskSpec to beaker.BeakerTaskSpec for v2 API
finbarrtimbers 0edd090
Fix remaining beaker v2 API changes
finbarrtimbers 7f8e4db
Fix BeakerPriority to BeakerJobPriority for v2 API
finbarrtimbers 5aee8fc
Fix BeakerJobPriority enum access to use string key
finbarrtimbers 1e46770
Fix experiment ID access for beaker v2 API
finbarrtimbers 73a4f3f
Fix beaker v2 API compatibility in utils.py and test_utils.py
finbarrtimbers c1a6768
Updated finetune script
finbarrtimbers 7e4cfb1
Now, finetune_8b.sh uses uv.
finbarrtimbers 50423b2
Updated finetune_8b.sh to work with build_image_and_launch.sh.
finbarrtimbers cdd2391
Updated code
finbarrtimbers abf1b45
added chat template to script.
finbarrtimbers 64fa692
changed priority
finbarrtimbers f20e13e
updated priority
finbarrtimbers 5bcda40
Updated scripts/train/debug/finetune.sh.
finbarrtimbers 8bdd718
added non-resumable flag.
finbarrtimbers 892f5a9
Set description for debug finetune script.
finbarrtimbers 92add3e
Actually set chat template name.
finbarrtimbers 1277e4e
Merge branch 'main' into update-beaker-py
finbarrtimbers 8371060
Updated env var name
finbarrtimbers c291c04
updated typo
finbarrtimbers 3f15ee7
Merge branch 'main' into update-beaker-py
finbarrtimbers 252b7a5
Updated code
finbarrtimbers 2ea42f9
Merge branch 'main' into update-beaker-py
finbarrtimbers 5bef970
Updated error handling for interactive
finbarrtimbers fd565de
Updated script
finbarrtimbers faeacc3
Refactored finetune.sh
finbarrtimbers f45806b
Merge branch 'main' into update-beaker-py
finbarrtimbers File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -159,8 +159,8 @@ def get_args(): | |
| ) | ||
| parser.add_argument( | ||
| "--timeout", | ||
| type=str, | ||
| help="Timeout for the Beaker task (e.g., '2h', '30m', '1d'). If not specified, no timeout is set.", | ||
| type=int, | ||
| help="Timeout for the Beaker task in seconds (e.g., 7200 for 2 hours). If not specified, no timeout is set.", | ||
| default=None, | ||
| ) | ||
| # Split up the mason args from the Python args. | ||
|
|
@@ -232,15 +232,15 @@ def get_env_vars( | |
| additional_env_var_names = {var["name"] for var in additional_env_vars} | ||
|
|
||
| env_vars = [ | ||
| beaker.EnvVar(name=name, value=value) | ||
| beaker.BeakerEnvVar(name=name, value=value) | ||
| for name, value in DEFAULT_ENV_VARS.items() | ||
| if name not in additional_env_var_names | ||
| ] | ||
|
|
||
| env_vars.extend([beaker.EnvVar(name=env_var["name"], value=env_var["value"]) for env_var in additional_env_vars]) | ||
| env_vars.extend([beaker.BeakerEnvVar(name=env_var["name"], value=env_var["value"]) for env_var in additional_env_vars]) | ||
|
|
||
| # add user-specific secrets | ||
| env_vars.extend([beaker.EnvVar(name=secret["name"], secret=secret["value"]) for secret in additional_secrets]) | ||
| env_vars.extend([beaker.BeakerEnvVar(name=secret["name"], secret=secret["value"]) for secret in additional_secrets]) | ||
|
|
||
| useful_secrets = [ | ||
| "HF_TOKEN", | ||
|
|
@@ -254,22 +254,24 @@ def get_env_vars( | |
| ] | ||
| for useful_secret in useful_secrets: | ||
| if f"{whoami}_{useful_secret}" in beaker_secrets: | ||
| env_vars.append(beaker.EnvVar(name=useful_secret, secret=f"{whoami}_{useful_secret}")) | ||
| env_vars.append(beaker.BeakerEnvVar(name=useful_secret, secret=f"{whoami}_{useful_secret}")) | ||
| elif useful_secret in beaker_secrets: | ||
| env_vars.append(beaker.EnvVar(name=useful_secret, secret=useful_secret)) | ||
| env_vars.append(beaker.BeakerEnvVar(name=useful_secret, secret=useful_secret)) | ||
|
|
||
| # use the user's PATH; including the conda / python PATH | ||
| if not pure_docker_mode: | ||
| env_vars.extend([beaker.EnvVar(name="PATH", value=os.getenv("PATH"))]) | ||
| env_vars.extend([beaker.BeakerEnvVar(name="PATH", value=os.getenv("PATH"))]) | ||
|
|
||
| # if all cluster is in weka, we mount the weka | ||
| if all(c in WEKA_CLUSTERS for c in cluster): | ||
| env_vars.extend( | ||
| [ | ||
| beaker.EnvVar(name="HF_HOME", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"), | ||
| beaker.EnvVar(name="HF_DATASETS_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"), | ||
| beaker.EnvVar(name="HF_HUB_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/hub"), | ||
| beaker.EnvVar( | ||
| beaker.BeakerEnvVar(name="HF_HOME", value="/weka/oe-adapt-default/allennlp/.cache/huggingface"), | ||
| beaker.BeakerEnvVar( | ||
| name="HF_DATASETS_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/huggingface" | ||
| ), | ||
| beaker.BeakerEnvVar(name="HF_HUB_CACHE", value="/weka/oe-adapt-default/allennlp/.cache/hub"), | ||
| beaker.BeakerEnvVar( | ||
| name="CHECKPOINT_OUTPUT_DIR", | ||
| value=f"/weka/oe-adapt-default/allennlp/deletable_checkpoint_states/{global_wandb_id}", | ||
| ), | ||
|
|
@@ -278,19 +280,19 @@ def get_env_vars( | |
| if num_nodes > 1: | ||
| env_vars.extend( | ||
| [ | ||
| beaker.EnvVar(name="NCCL_SOCKET_IFNAME", value="ib"), | ||
| beaker.EnvVar(name="NCCL_IB_HCA", value="^=mlx5_bond_0"), | ||
| beaker.BeakerEnvVar(name="NCCL_SOCKET_IFNAME", value="ib"), | ||
| beaker.BeakerEnvVar(name="NCCL_IB_HCA", value="^=mlx5_bond_0"), | ||
| ] | ||
| ) | ||
| # if all cluster is in gcp we add the following env | ||
|
|
||
| elif all(c in GCP_CLUSTERS for c in cluster): | ||
| env_vars.extend( | ||
| [ | ||
| beaker.EnvVar(name="HF_HOME", value="/filestore/.cache/huggingface"), | ||
| beaker.EnvVar(name="HF_DATASETS_CACHE", value="/filestore/.cache/huggingface"), | ||
| beaker.EnvVar(name="HF_HUB_CACHE", value="/filestore/.cache/hub"), | ||
| beaker.EnvVar( | ||
| beaker.BeakerEnvVar(name="HF_HOME", value="/filestore/.cache/huggingface"), | ||
| beaker.BeakerEnvVar(name="HF_DATASETS_CACHE", value="/filestore/.cache/huggingface"), | ||
| beaker.BeakerEnvVar(name="HF_HUB_CACHE", value="/filestore/.cache/hub"), | ||
| beaker.BeakerEnvVar( | ||
| name="HF_HUB_ENABLE_HF_TRANSFER", | ||
| value="0", # we disable it because GCP is weird on uploading to the hub | ||
| ), | ||
|
|
@@ -299,40 +301,40 @@ def get_env_vars( | |
| if num_nodes > 1: | ||
| env_vars.extend( | ||
| [ | ||
| beaker.EnvVar(name="LD_LIBRARY_PATH", value=r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}"), | ||
| beaker.EnvVar(name="NCCL_CROSS_NIC", value="0"), | ||
| beaker.EnvVar(name="NCCL_ALGO", value="Ring,Tree"), | ||
| beaker.EnvVar(name="NCCL_PROTO", value="Simple"), | ||
| beaker.EnvVar(name="NCCL_MIN_NCHANNELS", value="4"), | ||
| beaker.EnvVar(name="NCCL_P2P_NET_CHUNKSIZE", value="524288"), | ||
| beaker.EnvVar(name="NCCL_P2P_PCI_CHUNKSIZE", value="524288"), | ||
| beaker.EnvVar(name="NCCL_P2P_NVL_CHUNKSIZE", value="1048576"), | ||
| beaker.EnvVar(name="NCCL_FASTRAK_NUM_FLOWS", value="2"), | ||
| beaker.EnvVar(name="NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL", value="0"), | ||
| beaker.EnvVar(name="NCCL_BUFFSIZE", value="8388608"), | ||
| beaker.EnvVar(name="NCCL_FASTRAK_USE_SNAP", value="1"), | ||
| beaker.EnvVar(name="CUDA_VISIBLE_DEVICES", value="0,1,2,3,4,5,6,7"), | ||
| beaker.EnvVar(name="NCCL_NET_GDR_LEVEL", value="PIX"), | ||
| beaker.EnvVar(name="NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING", value="0"), | ||
| beaker.EnvVar(name="NCCL_TUNER_PLUGIN", value="libnccl-tuner.so"), | ||
| beaker.EnvVar( | ||
| beaker.BeakerEnvVar(name="LD_LIBRARY_PATH", value=r"/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}"), | ||
| beaker.BeakerEnvVar(name="NCCL_CROSS_NIC", value="0"), | ||
| beaker.BeakerEnvVar(name="NCCL_ALGO", value="Ring,Tree"), | ||
| beaker.BeakerEnvVar(name="NCCL_PROTO", value="Simple"), | ||
| beaker.BeakerEnvVar(name="NCCL_MIN_NCHANNELS", value="4"), | ||
| beaker.BeakerEnvVar(name="NCCL_P2P_NET_CHUNKSIZE", value="524288"), | ||
| beaker.BeakerEnvVar(name="NCCL_P2P_PCI_CHUNKSIZE", value="524288"), | ||
| beaker.BeakerEnvVar(name="NCCL_P2P_NVL_CHUNKSIZE", value="1048576"), | ||
| beaker.BeakerEnvVar(name="NCCL_FASTRAK_NUM_FLOWS", value="2"), | ||
| beaker.BeakerEnvVar(name="NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL", value="0"), | ||
| beaker.BeakerEnvVar(name="NCCL_BUFFSIZE", value="8388608"), | ||
| beaker.BeakerEnvVar(name="NCCL_FASTRAK_USE_SNAP", value="1"), | ||
| beaker.BeakerEnvVar(name="CUDA_VISIBLE_DEVICES", value="0,1,2,3,4,5,6,7"), | ||
| beaker.BeakerEnvVar(name="NCCL_NET_GDR_LEVEL", value="PIX"), | ||
| beaker.BeakerEnvVar(name="NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING", value="0"), | ||
| beaker.BeakerEnvVar(name="NCCL_TUNER_PLUGIN", value="libnccl-tuner.so"), | ||
| beaker.BeakerEnvVar( | ||
| name="NCCL_TUNER_CONFIG_PATH", value="/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto" | ||
| ), | ||
| beaker.EnvVar( | ||
| beaker.BeakerEnvVar( | ||
| name="NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE", | ||
| value="/var/lib/tcpxo/lib64/a3plus_guest_config.textproto", | ||
| ), | ||
| beaker.EnvVar(name="NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", value="600000"), | ||
| beaker.EnvVar(name="NCCL_NVLS_ENABLE", value="0"), | ||
| beaker.EnvVar(name="NCCL_FASTRAK_CTRL_DEV", value="enp0s12"), | ||
| beaker.EnvVar( | ||
| beaker.BeakerEnvVar(name="NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS", value="600000"), | ||
| beaker.BeakerEnvVar(name="NCCL_NVLS_ENABLE", value="0"), | ||
| beaker.BeakerEnvVar(name="NCCL_FASTRAK_CTRL_DEV", value="enp0s12"), | ||
| beaker.BeakerEnvVar( | ||
| name="NCCL_FASTRAK_IFNAME", | ||
| value="enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0", | ||
| ), | ||
| beaker.EnvVar(name="NCCL_SOCKET_IFNAME", value="enp0s12"), | ||
| beaker.EnvVar(name="NCCL_USE_SNAP", value="1"), | ||
| beaker.EnvVar(name="NCCL_FASTRAK_USE_LLCM", value="1"), | ||
| beaker.EnvVar(name="NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY", value="/dev/aperture_devices"), | ||
| beaker.BeakerEnvVar(name="NCCL_SOCKET_IFNAME", value="enp0s12"), | ||
| beaker.BeakerEnvVar(name="NCCL_USE_SNAP", value="1"), | ||
| beaker.BeakerEnvVar(name="NCCL_FASTRAK_USE_LLCM", value="1"), | ||
| beaker.BeakerEnvVar(name="NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY", value="/dev/aperture_devices"), | ||
| ] | ||
| ) | ||
| # don't mount anything; assume no cache | ||
|
|
@@ -342,8 +344,8 @@ def get_env_vars( | |
| if resumable: | ||
| env_vars.extend( | ||
| [ | ||
| beaker.EnvVar(name="WANDB_RUN_ID", value=global_wandb_id), | ||
| beaker.EnvVar(name="WANDB_RESUME", value="allow"), | ||
| beaker.BeakerEnvVar(name="WANDB_RUN_ID", value=global_wandb_id), | ||
| beaker.BeakerEnvVar(name="WANDB_RESUME", value="allow"), | ||
| ] | ||
| ) | ||
|
|
||
|
|
@@ -356,16 +358,22 @@ def get_datasets(beaker_datasets, cluster: List[str]): | |
| # if all cluster is in weka, we mount the weka | ||
| if all(c in WEKA_CLUSTERS for c in cluster): | ||
| res = [ | ||
| beaker.DataMount(source=beaker.DataSource(weka="oe-adapt-default"), mount_path="/weka/oe-adapt-default"), | ||
| beaker.DataMount( | ||
| source=beaker.DataSource(weka="oe-training-default"), mount_path="/weka/oe-training-default" | ||
| beaker.BeakerDataMount( | ||
| source=beaker.BeakerDataSource(weka="oe-adapt-default"), mount_path="/weka/oe-adapt-default" | ||
| ), | ||
| beaker.BeakerDataMount( | ||
| source=beaker.BeakerDataSource(weka="oe-training-default"), mount_path="/weka/oe-training-default" | ||
| ), | ||
| ] | ||
| elif all(c in GCP_CLUSTERS for c in cluster): | ||
| res = [beaker.DataMount(source=beaker.DataSource(host_path="/mnt/filestore_1"), mount_path="/filestore")] | ||
| res = [ | ||
| beaker.BeakerDataMount( | ||
| source=beaker.BeakerDataSource(host_path="/mnt/filestore_1"), mount_path="/filestore" | ||
| ) | ||
| ] | ||
| for beaker_dataset in beaker_datasets: | ||
| to_append = beaker.DataMount( | ||
| source=beaker.DataSource(beaker=beaker_dataset["beaker"]), mount_path=beaker_dataset["mount_path"] | ||
| to_append = beaker.BeakerDataMount( | ||
| source=beaker.BeakerDataSource(beaker=beaker_dataset["beaker"]), mount_path=beaker_dataset["mount_path"] | ||
| ) | ||
| res.append(to_append) | ||
|
|
||
|
|
@@ -716,17 +724,19 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami: | |
| raise ValueError("GCP clusters do not have the dev filesystem, please use a proper image") | ||
|
|
||
| if args.hostname is not None: | ||
| constraints = beaker.Constraints(hostname=args.hostname) | ||
| constraints = beaker.BeakerConstraints(hostname=args.hostname) | ||
| else: | ||
| constraints = beaker.Constraints(cluster=args.cluster) | ||
| spec = beaker.TaskSpec( | ||
| constraints = beaker.BeakerConstraints(cluster=args.cluster) | ||
| spec = beaker.BeakerTaskSpec( | ||
| name=f"{args.task_name}__{i}", | ||
| image=beaker.ImageSource(beaker=args.image), | ||
| image=beaker.BeakerImageSource(beaker=args.image), | ||
| command=["/bin/bash", "-c"], | ||
| arguments=[full_command], | ||
| result=beaker.ResultSpec(path="/output"), | ||
| result=beaker.BeakerResultSpec(path="/output"), | ||
| datasets=get_datasets(args.beaker_datasets, args.cluster), | ||
| context=beaker.TaskContext(priority=beaker.Priority(args.priority), preemptible=args.preemptible), | ||
| context=beaker.BeakerTaskContext( | ||
| priority=beaker.BeakerJobPriority[args.priority], preemptible=args.preemptible | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bug: Enum Lookup Fails on Case MismatchThe change to |
||
| ), | ||
| constraints=constraints, | ||
| env_vars=get_env_vars( | ||
| args.pure_docker_mode, | ||
|
|
@@ -738,7 +748,7 @@ def make_task_spec(args, full_command: str, i: int, beaker_secrets: str, whoami: | |
| args.env, | ||
| args.secret, | ||
| ), | ||
| resources=beaker.TaskResources(gpu_count=args.gpus, shared_memory=args.shared_memory), | ||
| resources=beaker.BeakerTaskResources(gpu_count=args.gpus, shared_memory=args.shared_memory), | ||
| replicas=args.num_nodes, | ||
| ) | ||
| if args.num_nodes > 1: | ||
|
|
@@ -769,8 +779,8 @@ def main(): | |
| beaker_client = beaker.Beaker.from_env(default_workspace=args.workspace) | ||
| else: | ||
| beaker_client = beaker.Beaker.from_env() | ||
| beaker_secrets = [secret.name for secret in beaker_client.workspace.secrets()] | ||
| whoami = beaker_client.account.whoami().name | ||
| beaker_secrets = [secret.name for secret in beaker_client.secret.list()] | ||
| whoami = beaker_client.user.get().name | ||
|
|
||
| full_commands = [make_internal_command(command, args, whoami, is_external_user) for command in commands] | ||
| if is_external_user: | ||
|
|
@@ -789,17 +799,17 @@ def main(): | |
| console.print(Text(full_command)) | ||
| if is_external_user: | ||
| return | ||
| experiment_spec = beaker.ExperimentSpec( | ||
| experiment_spec = beaker.BeakerExperimentSpec( | ||
| description=args.description, | ||
| tasks=[ | ||
| make_task_spec(args, full_command, i, beaker_secrets, whoami, args.resumable) | ||
| for i, full_command in enumerate(full_commands) | ||
| ], | ||
| budget=args.budget, | ||
| retry=beaker.RetrySpec(allowed_task_retries=args.max_retries), | ||
| retry=beaker.BeakerRetrySpec(allowed_task_retries=args.max_retries), | ||
| ) | ||
| exp = beaker_client.experiment.create(spec=experiment_spec) | ||
| console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.id}") | ||
| console.log(f"Kicked off Beaker job. https://beaker.org/ex/{exp.experiment.id}") | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.