31 changes: 10 additions & 21 deletions clusterscope/cluster_info.py
@@ -297,27 +297,26 @@ def get_task_resource_requirements(
         if gpus_per_task == 0:
 
             ram_mb_per_cpu = total_ram_per_node / total_cpus_per_node
-            total_required_ram_mb = math.ceil(
+            total_required_ram_mb = math.floor(
                 ram_mb_per_cpu * cpus_per_task * tasks_per_node
             )
         # GPU Request
         else:
             total_gpus_per_node = self.get_total_gpus_per_node()
 
             cpu_cores_per_gpu = total_cpus_per_node / total_gpus_per_node
-            total_required_cpu_cores_per_task = math.ceil(
+            total_required_cpu_cores_per_task = math.floor(
                 cpu_cores_per_gpu * gpus_per_task
             )
 
             ram_mb_per_gpu = total_ram_per_node / total_gpus_per_node
-            total_required_ram_mb = math.ceil(
+            total_required_ram_mb = math.floor(
                 ram_mb_per_gpu * gpus_per_task * tasks_per_node
             )
 
         cpu_cores_per_task = total_required_cpu_cores_per_task / tasks_per_node
 
-        # CPU cores per task: Round up to ensure we don't under-allocate
-        cpus_per_task = math.ceil(cpu_cores_per_task)
+        cpus_per_task = math.floor(cpu_cores_per_task)
 
         # Memory per node: Convert MB to GB and format for Slurm
         # Note: Memory is allocated per node, not per task in most Slurm configurations
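In short, this hunk switches every rounding in get_task_resource_requirements from math.ceil to math.floor, so fractional per-GPU shares are rounded down rather than up. A minimal sketch of the effect, using the 191-core / 8-GPU node shape from the updated test below (the standalone variables here paraphrase the function body and are illustrative, not part of this diff):

    import math

    # Hypothetical node shape, matching the rounding test further down
    total_cpus_per_node = 191
    total_gpus_per_node = 8
    gpus_per_task = 1

    cpu_cores_per_gpu = total_cpus_per_node / total_gpus_per_node  # 23.875

    # Before this PR: round up, which can request more cores than the
    # node's per-GPU share actually provides
    print(math.ceil(cpu_cores_per_gpu * gpus_per_task))   # 24

    # After this PR: round down, so the request never exceeds the share
    print(math.floor(cpu_cores_per_gpu * gpus_per_task))  # 23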
@@ -366,34 +365,24 @@ def get_array_job_requirements(
 
         if gpus_per_task == max_gpus_per_node:
             # For max GPUs, use all available resources
-            required_cpu_cores = total_cpu_cores
-            required_ram_mb = total_ram_mb
+            required_cpu_cores = math.floor(total_cpu_cores)
+            required_ram_mb = math.floor(total_ram_mb)
         else:
             # Calculate per-GPU allocation based on actual GPU count per node
             cpu_cores_per_gpu = total_cpu_cores / max_gpus_per_node
             ram_mb_per_gpu = total_ram_mb / max_gpus_per_node
 
             # Calculate requirements per array element
-            required_cpu_cores = math.ceil(cpu_cores_per_gpu * gpus_per_task)
-            required_ram_mb = math.ceil(ram_mb_per_gpu * gpus_per_task)
-
-        # Convert to Slurm SBATCH format
-        # CPU cores: Round up to ensure we don't under-allocate
-        sbatch_cpu_cores = math.ceil(required_cpu_cores)
+            required_cpu_cores = math.floor(cpu_cores_per_gpu * gpus_per_task)
+            required_ram_mb = math.floor(ram_mb_per_gpu * gpus_per_task)
 
         # Memory: Convert MB to GB and format for Slurm
         required_ram_gb = required_ram_mb / 1024
-        if required_ram_gb >= 1024:
-            # Use TB format for very large memory
-            sbatch_memory = f"{required_ram_gb / 1024:.0f}T"
-        else:
-            # Use GB format (most common)
-            sbatch_memory = f"{required_ram_gb:.0f}G"
+        sbatch_memory = f"{required_ram_gb:.0f}G"
 
         # Array jobs always have 1 task per array element
         return ResourceShape(
             slurm_partition=partition,
-            cpus_per_task=sbatch_cpu_cores,
+            cpus_per_task=required_cpu_cores,
             memory=sbatch_memory,
             tasks_per_node=1,
             gpus_per_task=gpus_per_task,
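Besides the same ceil-to-floor switch, this hunk drops the intermediate sbatch_cpu_cores variable and removes the terabyte formatting branch, so memory is now always emitted in whole gigabytes. A sketch of the new formatting path under the full-node value used in the tests (standalone variables again paraphrase the function body):

    import math

    total_ram_mb = 1843200  # full-node memory from the test case below

    required_ram_mb = math.floor(total_ram_mb)
    required_ram_gb = required_ram_mb / 1024  # 1800.0

    # With the >= 1024 GB branch removed, large values stay in gigabytes
    sbatch_memory = f"{required_ram_gb:.0f}G"
    print(sbatch_memory)  # "1800G" rather than the old "2T"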
8 changes: 3 additions & 5 deletions tests/test_cluster_info.py
@@ -888,8 +888,8 @@ def test_get_task_resource_requirements_cpu_rounding_up(
             partition="test_partition", gpus_per_task=1
         )
 
-        # 191/8 = 23.875, should round up to 24
-        self.assertEqual(result.cpus_per_task, 24)
+        # 191/8 = 23.875, should round down to 23
+        self.assertEqual(result.cpus_per_task, 23)
 
     @patch.object(UnifiedInfo, "get_total_gpus_per_node")
     def test_getResRequirements_invalid_tasks_per_node(self, mock_total_gpus):
@@ -956,9 +956,7 @@ def test_getArrayJobRequirements_full_node(
         )
 
         self.assertEqual(result.cpus_per_task, 192)  # All CPUs
-        self.assertEqual(
-            result.memory, "2T"
-        )  # All memory: 1843200/1024/1024 = 1.8TB rounds to 2T
+        self.assertEqual(result.memory, "1800G")  # All memory: 1843200/1024 = 1800GB
         self.assertEqual(result.tasks_per_node, 1)
 
     @patch.object(UnifiedInfo, "get_total_gpus_per_node")
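Taken together, the updated tests pin down the new behavior. As a standalone re-derivation of the full-node expectation (this helper is written for illustration and is not the library's API; the 192-core / 8-GPU / 1843200 MB node shape is taken from the test above):

    import math

    def array_job_shape(total_cpu_cores, total_ram_mb, max_gpus_per_node, gpus_per_task):
        # Mirrors the merged get_array_job_requirements logic
        if gpus_per_task == max_gpus_per_node:
            required_cpu_cores = math.floor(total_cpu_cores)
            required_ram_mb = math.floor(total_ram_mb)
        else:
            required_cpu_cores = math.floor(total_cpu_cores / max_gpus_per_node * gpus_per_task)
            required_ram_mb = math.floor(total_ram_mb / max_gpus_per_node * gpus_per_task)
        return required_cpu_cores, f"{required_ram_mb / 1024:.0f}G"

    # Full-node case from test_getArrayJobRequirements_full_node
    print(array_job_shape(192, 1843200, 8, 8))  # (192, '1800G')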