1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -29,3 +29,4 @@ repos:
args: ["--install-types", "--non-interactive"]
additional_dependencies:
- types-click
- click-option-group
40 changes: 22 additions & 18 deletions clusterscope/cli.py
@@ -5,9 +5,10 @@
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import json
from typing import Any, Dict
from typing import Any, Dict, Optional

import click
from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup

from clusterscope.cluster_info import AWSClusterInfo, UnifiedInfo
from clusterscope.validate import job_gen_task_slurm_validator
@@ -179,21 +180,7 @@ def task():
pass


@task.command()
@click.option(
"--gpus-per-task",
"gpus_per_task",
default=0,
type=click.IntRange(min=0),
help="Number of GPUs per task to request",
)
@click.option(
"--cpus-per-task",
"cpus_per_task",
default=0,
type=click.IntRange(min=0),
help="Number of CPUs per task to request",
)
@task.command() # type: ignore[arg-type]
@click.option("--partition", type=str, required=True, help="Partition to query")
@click.option(
"--tasks-per-node",
@@ -208,12 +195,29 @@ def task():
default="json",
help="Format to output the job requirements in",
)
@optgroup.group(
"GPU or CPU Job Request",
cls=RequiredMutuallyExclusiveOptionGroup,
help="Only one of --gpus-per-task or --cpus-per-task can be specified. For GPU requests, use --gpus-per-task and cpus-per-task will be generated automatically. For CPU requests, use --cpus-per-task.",
)
@optgroup.option(
"--gpus-per-task",
default=None,
type=click.IntRange(min=1),
help="Number of GPUs per task to request",
)
@optgroup.option( # type: ignore[arg-type]
"--cpus-per-task",
default=None,
type=click.IntRange(min=1),
help="Number of CPUs per task to request",
)
def slurm(
gpus_per_task: int,
cpus_per_task: int,
tasks_per_node: int,
output_format: str,
partition: str,
gpus_per_task: Optional[int],
cpus_per_task: Optional[int],
):
"""Generate job requirements for a task of a Slurm job based on GPU or CPU per task requirements."""
job_gen_task_slurm_validator(
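For reference, a minimal standalone sketch (not part of this diff) of the click-option-group pattern the new decorators use: a RequiredMutuallyExclusiveOptionGroup makes click reject invocations that pass both grouped options or neither, so the command body only ever sees one of them set.

# demo.py -- illustrative only; the command name and options mirror the PR,
# but this script is a sketch under assumptions, not project code.
import click
from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup


@click.command()
@optgroup.group("Resource request", cls=RequiredMutuallyExclusiveOptionGroup)
@optgroup.option("--gpus-per-task", type=click.IntRange(min=1), default=None)
@optgroup.option("--cpus-per-task", type=click.IntRange(min=1), default=None)
def demo(gpus_per_task, cpus_per_task):
    # Exactly one of the two is non-None by the time this body runs.
    click.echo(f"gpus_per_task={gpus_per_task} cpus_per_task={cpus_per_task}")


if __name__ == "__main__":
    demo()

Invoking it with both options, or with neither, exits with a usage error before the body runs, which is what lets the slurm command drop its old default=0 sentinels and switch to Optional parameters.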
32 changes: 19 additions & 13 deletions clusterscope/cluster_info.py
@@ -23,10 +23,10 @@ class ResourceShape(NamedTuple):
"""Represents resource requirements for a job in Slurm SBATCH format."""

cpus_per_task: int
gpus_per_task: int
memory: str
tasks_per_node: int
slurm_partition: str
gpus_per_task: Optional[int] = None

def to_dict(self) -> dict:
data = {k: v for k, v in self._asdict().items() if v is not None}
@@ -56,7 +56,7 @@ def to_sbatch(self) -> str:
f"#SBATCH --ntasks-per-node={self.tasks_per_node}",
f"#SBATCH --partition={self.slurm_partition}",
]
if self.gpus_per_task > 0:
if self.gpus_per_task and self.gpus_per_task > 0:
lines.append(f"#SBATCH --gpus-per-task={self.gpus_per_task}")
return "\n".join(lines)

@@ -73,7 +73,7 @@ def to_srun(self) -> str:
f"--ntasks-per-node={self.tasks_per_node}",
f"--partition={self.slurm_partition}",
]
if self.gpus_per_task > 0:
if self.gpus_per_task and self.gpus_per_task > 0:
cmd_parts.append(f"--gpus-per-task={self.gpus_per_task}")
return " ".join(cmd_parts)

@@ -90,7 +90,7 @@ def to_salloc(self) -> str:
f"--ntasks-per-node={self.tasks_per_node}",
f"--partition={self.slurm_partition}",
]
if self.gpus_per_task > 0:
if self.gpus_per_task and self.gpus_per_task > 0:
cmd_parts.append(f"--gpus-per-task={self.gpus_per_task}")
return " ".join(cmd_parts)

@@ -110,7 +110,7 @@ def to_submitit(self) -> str:
"slurm_partition",
"tasks_per_node",
]
if self.gpus_per_task > 0:
if self.gpus_per_task and self.gpus_per_task > 0:
attrs.append("gpus_per_task")
for attr_name in attrs:
value = getattr(self, attr_name)
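Taken together, the four hunks above make the emitters tolerate gpus_per_task being None: the --gpus-per-task flag (or submitit attribute) is only rendered when a value was actually supplied. A hypothetical usage sketch, with invented field values:

# Sketch only: the memory string and partition name are made up for illustration.
from clusterscope.cluster_info import ResourceShape

cpu_shape = ResourceShape(
    cpus_per_task=8,
    memory="64G",
    tasks_per_node=1,
    slurm_partition="cpu",
)
gpu_shape = cpu_shape._replace(gpus_per_task=2)

print(cpu_shape.to_sbatch())  # CPU-only shape: no "--gpus-per-task" line
print(gpu_shape.to_sbatch())  # includes "#SBATCH --gpus-per-task=2"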
@@ -267,9 +267,9 @@ def get_total_gpus_per_node(self) -> int:
def get_task_resource_requirements(
self,
partition: str,
gpus_per_task: int,
cpus_per_task: Optional[int] = None,
gpus_per_task: Optional[int] = None,
tasks_per_node: int = 1,
cpus_per_task: int = 0,
) -> ResourceShape:
"""Calculate resource requirements for better GPU packing based on node's GPU configuration.

@@ -281,30 +281,32 @@ def get_task_resource_requirements(
per-array-element resource allocation.

Args:
gpus_per_task (int): Total number of GPUs required per task (1 to max available)
cpus_per_task (int): Total number of CPUs required per task (1 to max available)
(mutually exclusive, at least 1 required):
cpus_per_task (int): Total number of CPUs required per task (1 to max available)
gpus_per_task (int): Total number of GPUs required per task (1 to max available)
tasks_per_node (int): Number of tasks to run per node (default: 1)

Returns:
ResourceShape: Tuple containing CPU cores per task (int), memory per node (str),
and tasks per node (int)
"""
assert not (gpus_per_task is None and cpus_per_task is None)
assert not (
gpus_per_task is None and cpus_per_task is None
), "gpus_per_task, and cpus_per_task are mutually exclusive, at least 1 required"
if tasks_per_node < 1:
raise ValueError("tasks_per_node must be at least 1")

total_cpus_per_node = self.get_cpus_per_node()
total_ram_per_node = self.get_mem_per_node_MB()

# CPU Request
if gpus_per_task == 0:

if cpus_per_task is not None:
ram_mb_per_cpu = total_ram_per_node / total_cpus_per_node
total_required_ram_mb = math.floor(
ram_mb_per_cpu * cpus_per_task * tasks_per_node
)
# GPU Request
else:
elif gpus_per_task is not None:
total_gpus_per_node = self.get_total_gpus_per_node()

cpu_cores_per_gpu = total_cpus_per_node / total_gpus_per_node
@@ -320,6 +322,10 @@
cpu_cores_per_task = total_required_cpu_cores_per_task / tasks_per_node

cpus_per_task = math.floor(cpu_cores_per_task)
else:
raise ValueError(
"gpus_per_task, and cpus_per_task are mutually exclusive, at least 1 required."
)

# Memory per node: Convert MB to GB and format for Slurm
# Note: Memory is allocated per node, not per task in most Slurm configurations
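The sizing logic above allocates memory in proportion to the share of the node being requested. A worked example of the now fully visible CPU branch, with made-up node numbers (the real values come from get_cpus_per_node() and get_mem_per_node_MB()):

import math

# Hypothetical node: 128 CPUs and ~1 TB of RAM.
total_cpus_per_node = 128
total_ram_per_node_mb = 1_000_000

# CPU request: cpus_per_task=16, tasks_per_node=2
ram_mb_per_cpu = total_ram_per_node_mb / total_cpus_per_node  # 7812.5 MB per CPU
total_required_ram_mb = math.floor(ram_mb_per_cpu * 16 * 2)   # 250000 MB for the node

print(total_required_ram_mb)  # 250000

The GPU branch (partly elided in the hunk above) derives the per-task CPU count from cpu_cores_per_gpu in the same proportional fashion, and the new else branch raises an explicit ValueError when neither option is set, matching the assertion at the top of the function.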
20 changes: 14 additions & 6 deletions clusterscope/validate.py
@@ -1,17 +1,19 @@
import logging
import sys
from typing import Optional

from clusterscope.slurm.partition import get_partition_info


def job_gen_task_slurm_validator(
partition: str,
gpus_per_task: int,
cpus_per_task: int,
gpus_per_task: Optional[int],
cpus_per_task: Optional[int],
tasks_per_node: int,
exit_on_error: bool = False,
) -> None:
"""Validate the job requirements for a task of a Slurm job based on GPU or CPU per task requirements.
This validation is used for CLI and API calls.

Returns: None

@@ -22,12 +24,12 @@ def job_gen_task_slurm_validator(
logging.error("Either gpus_per_task or cpus_per_task must be specified.")
sys.exit(1)
raise ValueError("Either gpus_per_task or cpus_per_task must be specified.")
if cpus_per_task < 0:
if cpus_per_task and cpus_per_task < 0:
if exit_on_error:
logging.error("cpus_per_task has to be >= 0.")
sys.exit(1)
raise ValueError("cpus_per_task has to be >= 0.")
if gpus_per_task < 0:
if gpus_per_task and gpus_per_task < 0:
if exit_on_error:
logging.error("gpus_per_task has to be >= 0.")
sys.exit(1)
@@ -61,7 +63,10 @@ def job_gen_task_slurm_validator(
)

# reject if requires more GPUs than the max GPUs per node for the partition
if gpus_per_task * tasks_per_node > req_partition.max_gpus_per_node:
if (
gpus_per_task
and gpus_per_task * tasks_per_node > req_partition.max_gpus_per_node
):
if exit_on_error:
logging.error(
f"Requested {gpus_per_task=} GPUs with {tasks_per_node=} exceeds the maximum {req_partition.max_gpus_per_node} GPUs per node available in partition '{partition}'"
@@ -72,7 +77,10 @@
)

# reject if requires more CPUs than the max CPUs at the partition
if cpus_per_task * tasks_per_node > req_partition.max_cpus_per_node:
if (
cpus_per_task
and cpus_per_task * tasks_per_node > req_partition.max_cpus_per_node
):
if exit_on_error:
logging.error(
f"Requested {cpus_per_task=} CPUs with {tasks_per_node=} exceeds the maximum {req_partition.max_cpus_per_node} CPUs per node available in partition '{partition}'"
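The validator now relies on truthiness guards so that checks tied to an option are skipped when that option was not supplied. A minimal sketch of the pattern (check_limit is an illustrative helper, not part of clusterscope):

from typing import Optional


def check_limit(name: str, value: Optional[int], tasks_per_node: int, limit: int) -> None:
    # Skips silently when value is None (the option was not supplied); fires
    # only when a value was given and the total exceeds the partition limit.
    if value and value * tasks_per_node > limit:
        raise ValueError(f"{name}={value} with {tasks_per_node=} exceeds the limit of {limit}")


check_limit("gpus_per_task", None, 4, 8)  # no-op: GPUs were not requested
check_limit("cpus_per_task", 64, 4, 192)  # raises: 256 > 192

Note that a truthy check also skips a literal 0, which is acceptable here because the CLI now enforces IntRange(min=1) on both options.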
3 changes: 3 additions & 0 deletions dev-requirements.txt
@@ -14,9 +14,12 @@ click==8.2.1
# via
# clusterscope (pyproject.toml)
# black
# click-option-group
# moreorless
# ufmt
# usort
click-option-group==0.5.8
# via clusterscope (pyproject.toml)
distlib==0.4.0
# via virtualenv
filelock==3.19.1
1 change: 1 addition & 0 deletions pyproject.toml
@@ -21,6 +21,7 @@ authors = [
]
dependencies = [
"click>=8.0.0, !=8.3.0",
"click-option-group",
]

[project.scripts]
4 changes: 4 additions & 0 deletions requirements.txt
@@ -1,4 +1,8 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml -o requirements.txt
click==8.2.1
# via
# clusterscope (pyproject.toml)
# click-option-group
click-option-group==0.5.8
# via clusterscope (pyproject.toml)