1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -29,3 +29,4 @@ repos:
args: ["--install-types", "--non-interactive"]
additional_dependencies:
- types-click
- click-option-group
40 changes: 22 additions & 18 deletions clusterscope/cli.py
@@ -5,9 +5,10 @@
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import json
from typing import Any, Dict
from typing import Any, Dict, Optional

import click
from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup

from clusterscope.cluster_info import AWSClusterInfo, UnifiedInfo
from clusterscope.validate import job_gen_task_slurm_validator
@@ -179,21 +180,7 @@ def task():
pass


@task.command()
@click.option(
"--gpus-per-task",
"gpus_per_task",
default=0,
type=click.IntRange(min=0),
help="Number of GPUs per task to request",
)
@click.option(
"--cpus-per-task",
"cpus_per_task",
default=0,
type=click.IntRange(min=0),
help="Number of CPUs per task to request",
)
@task.command() # type: ignore[arg-type]
@click.option("--partition", type=str, required=True, help="Partition to query")
@click.option(
"--tasks-per-node",
@@ -208,12 +195,29 @@ def task():
default="json",
help="Format to output the job requirements in",
)
@optgroup.group(
"GPU or CPU Job Request",
cls=RequiredMutuallyExclusiveOptionGroup,
help="Only one of --gpus-per-task or --cpus-per-task can be specified. For GPU requests, use --gpus-per-task and cpus-per-task will be generated automatically. For CPU requests, use --cpus-per-task.",
)
@optgroup.option(
"--gpus-per-task",
default=None,
type=click.IntRange(min=1),
help="Number of GPUs per task to request",
)
@optgroup.option( # type: ignore[arg-type]
"--cpus-per-task",
default=None,
type=click.IntRange(min=1),
help="Number of CPUs per task to request",
)
def slurm(
gpus_per_task: int,
cpus_per_task: int,
tasks_per_node: int,
output_format: str,
partition: str,
gpus_per_task: Optional[int],
cpus_per_task: Optional[int],
):
"""Generate job requirements for a task of a Slurm job based on GPU or CPU per task requirements."""
job_gen_task_slurm_validator(
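For reference, a minimal standalone sketch (not part of this diff) of the click-option-group pattern the new decorators use: a RequiredMutuallyExclusiveOptionGroup makes click reject invocations that pass both grouped options or neither, so the command body only ever sees one of them set.

# demo.py -- illustrative only; the command name and options mirror the PR,
# but this script is a sketch under assumptions, not project code.
import click
from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup


@click.command()
@optgroup.group("Resource request", cls=RequiredMutuallyExclusiveOptionGroup)
@optgroup.option("--gpus-per-task", type=click.IntRange(min=1), default=None)
@optgroup.option("--cpus-per-task", type=click.IntRange(min=1), default=None)
def demo(gpus_per_task, cpus_per_task):
    # Exactly one of the two is non-None by the time this body runs.
    click.echo(f"gpus_per_task={gpus_per_task} cpus_per_task={cpus_per_task}")


if __name__ == "__main__":
    demo()

Invoking it with both options, or with neither, exits with a usage error before the body runs, which is what lets the slurm command drop its old default=0 sentinels and switch to Optional parameters.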
32 changes: 19 additions & 13 deletions clusterscope/cluster_info.py
@@ -23,10 +23,10 @@ class ResourceShape(NamedTuple):
"""Represents resource requirements for a job in Slurm SBATCH format."""

cpus_per_task: int
gpus_per_task: int
memory: str
tasks_per_node: int
slurm_partition: str
gpus_per_task: Optional[int] = None

def to_dict(self) -> dict:
data = {k: v for k, v in self._asdict().items() if v is not None}
@@ -56,7 +56,7 @@ def to_sbatch(self) -> str:
f"#SBATCH --ntasks-per-node={self.tasks_per_node}",
f"#SBATCH --partition={self.slurm_partition}",
]
if self.gpus_per_task > 0:
if self.gpus_per_task and self.gpus_per_task > 0:
lines.append(f"#SBATCH --gpus-per-task={self.gpus_per_task}")
return "\n".join(lines)

@@ -73,7 +73,7 @@ def to_srun(self) -> str:
f"--ntasks-per-node={self.tasks_per_node}",
f"--partition={self.slurm_partition}",
]
if self.gpus_per_task > 0:
if self.gpus_per_task and self.gpus_per_task > 0:
cmd_parts.append(f"--gpus-per-task={self.gpus_per_task}")
return " ".join(cmd_parts)

@@ -90,7 +90,7 @@ def to_salloc(self) -> str:
f"--ntasks-per-node={self.tasks_per_node}",
f"--partition={self.slurm_partition}",
]
if self.gpus_per_task > 0:
if self.gpus_per_task and self.gpus_per_task > 0:
cmd_parts.append(f"--gpus-per-task={self.gpus_per_task}")
return " ".join(cmd_parts)

@@ -110,7 +110,7 @@ def to_submitit(self) -> str:
"slurm_partition",
"tasks_per_node",
]
if self.gpus_per_task > 0:
if self.gpus_per_task and self.gpus_per_task > 0:
attrs.append("gpus_per_task")
for attr_name in attrs:
value = getattr(self, attr_name)
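Taken together, the four hunks above make the emitters tolerate gpus_per_task being None: the --gpus-per-task flag (or submitit attribute) is only rendered when a value was actually supplied. A hypothetical usage sketch, with invented field values:

# Sketch only: the memory string and partition name are made up for illustration.
from clusterscope.cluster_info import ResourceShape

cpu_shape = ResourceShape(
    cpus_per_task=8,
    memory="64G",
    tasks_per_node=1,
    slurm_partition="cpu",
)
gpu_shape = cpu_shape._replace(gpus_per_task=2)

print(cpu_shape.to_sbatch())  # CPU-only shape: no "--gpus-per-task" line
print(gpu_shape.to_sbatch())  # includes "#SBATCH --gpus-per-task=2"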
@@ -267,9 +267,9 @@ def get_total_gpus_per_node(self) -> int:
def get_task_resource_requirements(
self,
partition: str,
gpus_per_task: int,
cpus_per_task: Optional[int] = None,
gpus_per_task: Optional[int] = None,
tasks_per_node: int = 1,
cpus_per_task: int = 0,
) -> ResourceShape:
"""Calculate resource requirements for better GPU packing based on node's GPU configuration.

@@ -281,30 +281,32 @@ def get_task_resource_requirements(
per-array-element resource allocation.

Args:
gpus_per_task (int): Total number of GPUs required per task (1 to max available)
cpus_per_task (int): Total number of CPUs required per task (1 to max available)
(mutually exclusive, at least 1 required):
cpus_per_task (int): Total number of CPUs required per task (1 to max available)
gpus_per_task (int): Total number of GPUs required per task (1 to max available)
tasks_per_node (int): Number of tasks to run per node (default: 1)

Returns:
ResourceShape: Tuple containing CPU cores per task (int), memory per node (str),
and tasks per node (int)
"""
assert not (gpus_per_task is None and cpus_per_task is None)
assert not (
gpus_per_task is None and cpus_per_task is None
), "gpus_per_task, and cpus_per_task are mutually exclusive, at least 1 required"
if tasks_per_node < 1:
raise ValueError("tasks_per_node must be at least 1")

total_cpus_per_node = self.get_cpus_per_node()
total_ram_per_node = self.get_mem_per_node_MB()

# CPU Request
if gpus_per_task == 0:

if cpus_per_task is not None:
ram_mb_per_cpu = total_ram_per_node / total_cpus_per_node
total_required_ram_mb = math.floor(
ram_mb_per_cpu * cpus_per_task * tasks_per_node
)
# GPU Request
else:
elif gpus_per_task is not None:
total_gpus_per_node = self.get_total_gpus_per_node()

cpu_cores_per_gpu = total_cpus_per_node / total_gpus_per_node
@@ -320,6 +322,10 @@
cpu_cores_per_task = total_required_cpu_cores_per_task / tasks_per_node

cpus_per_task = math.floor(cpu_cores_per_task)
else:
raise ValueError(
"gpus_per_task, and cpus_per_task are mutually exclusive, at least 1 required."
)

# Memory per node: Convert MB to GB and format for Slurm
# Note: Memory is allocated per node, not per task in most Slurm configurations
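The sizing logic above allocates memory in proportion to the share of the node being requested. A worked example of the now fully visible CPU branch, with made-up node numbers (the real values come from get_cpus_per_node() and get_mem_per_node_MB()):

import math

# Hypothetical node: 128 CPUs and ~1 TB of RAM.
total_cpus_per_node = 128
total_ram_per_node_mb = 1_000_000

# CPU request: cpus_per_task=16, tasks_per_node=2
ram_mb_per_cpu = total_ram_per_node_mb / total_cpus_per_node  # 7812.5 MB per CPU
total_required_ram_mb = math.floor(ram_mb_per_cpu * 16 * 2)   # 250000 MB for the node

print(total_required_ram_mb)  # 250000

The GPU branch (partly elided in the hunk above) derives the per-task CPU count from cpu_cores_per_gpu in the same proportional fashion, and the new else branch raises an explicit ValueError when neither option is set, matching the assertion at the top of the function.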
20 changes: 14 additions & 6 deletions clusterscope/validate.py
@@ -1,17 +1,19 @@
import logging
import sys
from typing import Optional

from clusterscope.slurm.partition import get_partition_info


def job_gen_task_slurm_validator(
partition: str,
gpus_per_task: int,
cpus_per_task: int,
gpus_per_task: Optional[int],
cpus_per_task: Optional[int],
tasks_per_node: int,
exit_on_error: bool = False,
) -> None:
"""Validate the job requirements for a task of a Slurm job based on GPU or CPU per task requirements.
This validation is used for CLI and API calls.

Returns: None

@@ -22,12 +24,12 @@ def job_gen_task_slurm_validator(
logging.error("Either gpus_per_task or cpus_per_task must be specified.")
sys.exit(1)
raise ValueError("Either gpus_per_task or cpus_per_task must be specified.")
if cpus_per_task < 0:
if cpus_per_task and cpus_per_task < 0:
if exit_on_error:
logging.error("cpus_per_task has to be >= 0.")
sys.exit(1)
raise ValueError("cpus_per_task has to be >= 0.")
if gpus_per_task < 0:
if gpus_per_task and gpus_per_task < 0:
if exit_on_error:
logging.error("gpus_per_task has to be >= 0.")
sys.exit(1)
@@ -61,7 +63,10 @@ def job_gen_task_slurm_validator(
)

# reject if requires more GPUs than the max GPUs per node for the partition
if gpus_per_task * tasks_per_node > req_partition.max_gpus_per_node:
if (
gpus_per_task
and gpus_per_task * tasks_per_node > req_partition.max_gpus_per_node
):
if exit_on_error:
logging.error(
f"Requested {gpus_per_task=} GPUs with {tasks_per_node=} exceeds the maximum {req_partition.max_gpus_per_node} GPUs per node available in partition '{partition}'"
@@ -72,7 +77,10 @@
)

# reject if requires more CPUs than the max CPUs at the partition
if cpus_per_task * tasks_per_node > req_partition.max_cpus_per_node:
if (
cpus_per_task
and cpus_per_task * tasks_per_node > req_partition.max_cpus_per_node
):
if exit_on_error:
logging.error(
f"Requested {cpus_per_task=} CPUs with {tasks_per_node=} exceeds the maximum {req_partition.max_cpus_per_node} CPUs per node available in partition '{partition}'"
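The validator now relies on truthiness guards so that checks tied to an option are skipped when that option was not supplied. A minimal sketch of the pattern (check_limit is an illustrative helper, not part of clusterscope):

from typing import Optional


def check_limit(name: str, value: Optional[int], tasks_per_node: int, limit: int) -> None:
    # Skips silently when value is None (the option was not supplied); fires
    # only when a value was given and the total exceeds the partition limit.
    if value and value * tasks_per_node > limit:
        raise ValueError(f"{name}={value} with {tasks_per_node=} exceeds the limit of {limit}")


check_limit("gpus_per_task", None, 4, 8)  # no-op: GPUs were not requested
check_limit("cpus_per_task", 64, 4, 192)  # raises: 256 > 192

Note that a truthy check also skips a literal 0, which is acceptable here because the CLI now enforces IntRange(min=1) on both options.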
3 changes: 3 additions & 0 deletions dev-requirements.txt
@@ -14,9 +14,12 @@ click==8.2.1
# via
# clusterscope (pyproject.toml)
# black
# click-option-group
# moreorless
# ufmt
# usort
click-option-group==0.5.8
# via clusterscope (pyproject.toml)
distlib==0.4.0
# via virtualenv
filelock==3.19.1
1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ authors = [
]
dependencies = [
"click>=8.0.0, !=8.3.0",
"click-option-group",
]

[project.scripts]
4 changes: 4 additions & 0 deletions requirements.txt
@@ -1,4 +1,8 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml -o requirements.txt
click==8.2.1
# via
# clusterscope (pyproject.toml)
# click-option-group
click-option-group==0.5.8
# via clusterscope (pyproject.toml)