diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 430b075..30df6e6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,3 +29,4 @@ repos:
         args: ["--install-types", "--non-interactive"]
         additional_dependencies:
           - types-click
+          - click-option-group
diff --git a/clusterscope/cli.py b/clusterscope/cli.py
index c2a2683..8aaebb3 100644
--- a/clusterscope/cli.py
+++ b/clusterscope/cli.py
@@ -5,9 +5,10 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 import json
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 import click
+from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup
 
 from clusterscope.cluster_info import AWSClusterInfo, UnifiedInfo
 from clusterscope.validate import job_gen_task_slurm_validator
@@ -179,21 +180,7 @@ def task():
     pass
 
 
-@task.command()
-@click.option(
-    "--gpus-per-task",
-    "gpus_per_task",
-    default=0,
-    type=click.IntRange(min=0),
-    help="Number of GPUs per task to request",
-)
-@click.option(
-    "--cpus-per-task",
-    "cpus_per_task",
-    default=0,
-    type=click.IntRange(min=0),
-    help="Number of CPUs per task to request",
-)
+@task.command()  # type: ignore[arg-type]
 @click.option("--partition", type=str, required=True, help="Partition to query")
 @click.option(
     "--tasks-per-node",
@@ -208,12 +195,29 @@ def task():
     default="json",
     help="Format to output the job requirements in",
 )
+@optgroup.group(
+    "GPU or CPU Job Request",
+    cls=RequiredMutuallyExclusiveOptionGroup,
+    help="Exactly one of --gpus-per-task or --cpus-per-task must be specified. For GPU requests, use --gpus-per-task; a matching CPU count is derived automatically. For CPU requests, use --cpus-per-task.",
+)
+@optgroup.option(
+    "--gpus-per-task",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Number of GPUs per task to request",
+)
+@optgroup.option(  # type: ignore[arg-type]
+    "--cpus-per-task",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Number of CPUs per task to request",
+)
 def slurm(
-    gpus_per_task: int,
-    cpus_per_task: int,
     tasks_per_node: int,
     output_format: str,
     partition: str,
+    gpus_per_task: Optional[int],
+    cpus_per_task: Optional[int],
 ):
     """Generate job requirements for a task of a Slurm job based on GPU or CPU per task requirements."""
     job_gen_task_slurm_validator(
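
Review note: for anyone unfamiliar with click-option-group, `RequiredMutuallyExclusiveOptionGroup` enforces at parse time that exactly one option in the group is supplied, which is what lets the CLI drop the old `default=0` sentinels. A minimal standalone sketch of the pattern (the `demo` command and its options are illustrative, not part of this patch):

```python
import click
from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup


@click.command()
@optgroup.group(
    "Resource request",
    cls=RequiredMutuallyExclusiveOptionGroup,
    help="Exactly one of the options below must be given.",
)
@optgroup.option("--gpus-per-task", default=None, type=click.IntRange(min=1))
@optgroup.option("--cpus-per-task", default=None, type=click.IntRange(min=1))
def demo(gpus_per_task, cpus_per_task):
    """Echo whichever resource option was chosen; the other stays None."""
    click.echo(f"{gpus_per_task=} {cpus_per_task=}")


# $ demo                                      -> usage error (one option is required)
# $ demo --gpus-per-task 2 --cpus-per-task 4  -> usage error (mutually exclusive)
# $ demo --gpus-per-task 2                    -> gpus_per_task=2 cpus_per_task=None
```
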
f"--ntasks-per-node={self.tasks_per_node}", f"--partition={self.slurm_partition}", ] - if self.gpus_per_task > 0: + if self.gpus_per_task and self.gpus_per_task > 0: cmd_parts.append(f"--gpus-per-task={self.gpus_per_task}") return " ".join(cmd_parts) @@ -110,7 +110,7 @@ def to_submitit(self) -> str: "slurm_partition", "tasks_per_node", ] - if self.gpus_per_task > 0: + if self.gpus_per_task and self.gpus_per_task > 0: attrs.append("gpus_per_task") for attr_name in attrs: value = getattr(self, attr_name) @@ -267,9 +267,9 @@ def get_total_gpus_per_node(self) -> int: def get_task_resource_requirements( self, partition: str, - gpus_per_task: int, + cpus_per_task: Optional[int] = None, + gpus_per_task: Optional[int] = None, tasks_per_node: int = 1, - cpus_per_task: int = 0, ) -> ResourceShape: """Calculate resource requirements for better GPU packing based on node's GPU configuration. @@ -281,15 +281,18 @@ def get_task_resource_requirements( per-array-element resource allocation. Args: - gpus_per_task (int): Total number of GPUs required per task (1 to max available) - cpus_per_task (int): Total number of CPUs required per task (1 to max available) + (mutually exclusive, at least 1 required): + cpus_per_task (int): Total number of CPUs required per task (1 to max available) + gpus_per_task (int): Total number of GPUs required per task (1 to max available) tasks_per_node (int): Number of tasks to run per node (default: 1) Returns: ResourceShape: Tuple containing CPU cores per task (int), memory per node (str), and tasks per node (int) """ - assert not (gpus_per_task is None and cpus_per_task is None) + assert not ( + gpus_per_task is None and cpus_per_task is None + ), "gpus_per_task, and cpus_per_task are mutually exclusive, at least 1 required" if tasks_per_node < 1: raise ValueError("tasks_per_node must be at least 1") @@ -297,14 +300,13 @@ def get_task_resource_requirements( total_ram_per_node = self.get_mem_per_node_MB() # CPU Request - if gpus_per_task == 0: - + if cpus_per_task is not None: ram_mb_per_cpu = total_ram_per_node / total_cpus_per_node total_required_ram_mb = math.floor( ram_mb_per_cpu * cpus_per_task * tasks_per_node ) # GPU Request - else: + elif gpus_per_task is not None: total_gpus_per_node = self.get_total_gpus_per_node() cpu_cores_per_gpu = total_cpus_per_node / total_gpus_per_node @@ -320,6 +322,10 @@ def get_task_resource_requirements( cpu_cores_per_task = total_required_cpu_cores_per_task / tasks_per_node cpus_per_task = math.floor(cpu_cores_per_task) + else: + raise ValueError( + "gpus_per_task, and cpus_per_task are mutually exclusive, at least 1 required." + ) # Memory per node: Convert MB to GB and format for Slurm # Note: Memory is allocated per node, not per task in most Slurm configurations diff --git a/clusterscope/validate.py b/clusterscope/validate.py index 0f2b618..6034433 100644 --- a/clusterscope/validate.py +++ b/clusterscope/validate.py @@ -1,17 +1,19 @@ import logging import sys +from typing import Optional from clusterscope.slurm.partition import get_partition_info def job_gen_task_slurm_validator( partition: str, - gpus_per_task: int, - cpus_per_task: int, + gpus_per_task: Optional[int], + cpus_per_task: Optional[int], tasks_per_node: int, exit_on_error: bool = False, ) -> None: """Validate the job requirements for a task of a Slurm job based on GPU or CPU per task requirements. + This validation is used for CLI and API calls. 
diff --git a/dev-requirements.txt b/dev-requirements.txt
index 752d11b..b5e4319 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -14,9 +14,12 @@ click==8.2.1
     # via
     #   clusterscope (pyproject.toml)
     #   black
+    #   click-option-group
     #   moreorless
     #   ufmt
     #   usort
+click-option-group==0.5.8
+    # via clusterscope (pyproject.toml)
 distlib==0.4.0
     # via virtualenv
 filelock==3.19.1
diff --git a/pyproject.toml b/pyproject.toml
index d97d1c5..3fa178d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ authors = [
 ]
 dependencies = [
     "click>=8.0.0, !=8.3.0",
+    "click-option-group",
 ]
 
 [project.scripts]
diff --git a/requirements.txt b/requirements.txt
index 682bd60..2b060a1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,8 @@
 # This file was autogenerated by uv via the following command:
 #    uv pip compile pyproject.toml -o requirements.txt
 click==8.2.1
+    # via
+    #   clusterscope (pyproject.toml)
+    #   click-option-group
+click-option-group==0.5.8
     # via clusterscope (pyproject.toml)
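
One last review note on `get_task_resource_requirements()`: the patch swaps the positions of `gpus_per_task` and `cpus_per_task`, so any existing caller passing them positionally would silently flip its request; keyword arguments sidestep that. A hedged usage sketch (it assumes `UnifiedInfo()` can be constructed without arguments, as the CLI does, and uses hypothetical partition names):

```python
from clusterscope.cluster_info import UnifiedInfo

info = UnifiedInfo()  # assumes a reachable Slurm cluster

# GPU path: the CPU count and memory are derived from the node's GPU:CPU ratio.
gpu_shape = info.get_task_resource_requirements(
    partition="gpu",  # hypothetical partition name
    gpus_per_task=2,
)

# CPU path: memory is derived from the node's per-CPU share.
cpu_shape = info.get_task_resource_requirements(
    partition="cpu",  # hypothetical partition name
    cpus_per_task=16,
)

print(gpu_shape.to_srun())
print(cpu_shape.to_sbatch())
```

Note that if both arguments were passed through the Python API (the CLI group forbids the combination), the new `if cpus_per_task is not None` branch wins, so mutual exclusion is only truly enforced at the CLI layer.
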