36 changes: 0 additions & 36 deletions clusterscope/cli.py
@@ -243,42 +243,6 @@ def slurm(
click.echo(format_methods[output_format]())


@job_gen.command()
@click.option(
Contributor

Why is this being removed? I don't get it. Thought we wanted to eliminate the "array-job"-related stuff.

Member Author

These are the args being passed to the array CLI command (def array below); the whole command is reassembled in the sketch after this file's diff.

"--gpus-per-task", type=int, required=True, help="Number of GPUs per task"
)
@click.option(
"--format",
"output_format",
type=click.Choice(["json", "sbatch", "srun", "submitit", "salloc"]),
default="json",
help="Format to output the job requirements in",
)
@click.option(
"--partition",
type=str,
default=None,
help="Slurm partition name to filter queries (optional)",
)
def array(gpus_per_task: int, output_format: str, partition: str):
"""Generate job requirements for an array job."""
unified_info = UnifiedInfo(partition=partition)
job_requirements = unified_info.get_array_job_requirements(
partition=partition,
gpus_per_task=gpus_per_task,
)

# Route to the correct format method based on CLI option
format_methods = {
"json": job_requirements.to_json,
"sbatch": job_requirements.to_sbatch,
"srun": job_requirements.to_srun,
"salloc": job_requirements.to_salloc,
"submitit": job_requirements.to_submitit,
}
click.echo(format_methods[output_format]())


def main():
"""Main entry point for the Slurm information CLI."""
cli()
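
For reference, here is the removed command reassembled from the hunks above into one sketch, since the review thread splits it in the rendered diff. The job_gen group and the import are assumptions (they sit outside this diff), not a copy of the original module:

import click

from clusterscope.cluster_info import UnifiedInfo  # assumed import path


@click.group()
def job_gen():
    """Stand-in for the existing job_gen group, which this diff does not show."""


@job_gen.command()
@click.option("--gpus-per-task", type=int, required=True,
              help="Number of GPUs per task")
@click.option("--format", "output_format",
              type=click.Choice(["json", "sbatch", "srun", "submitit", "salloc"]),
              default="json", help="Format to output the job requirements in")
@click.option("--partition", type=str, default=None,
              help="Slurm partition name to filter queries (optional)")
def array(gpus_per_task: int, output_format: str, partition: str):
    """Generate job requirements for an array job."""
    job_requirements = UnifiedInfo(partition=partition).get_array_job_requirements(
        partition=partition, gpus_per_task=gpus_per_task
    )
    # Route to the matching format method based on the CLI option.
    format_methods = {
        "json": job_requirements.to_json,
        "sbatch": job_requirements.to_sbatch,
        "srun": job_requirements.to_srun,
        "salloc": job_requirements.to_salloc,
        "submitit": job_requirements.to_submitit,
    }
    click.echo(format_methods[output_format]())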
56 changes: 0 additions & 56 deletions clusterscope/cluster_info.py
@@ -335,62 +335,6 @@ def get_task_resource_requirements(
tasks_per_node=tasks_per_node,
)

def get_array_job_requirements(
self, partition: str, gpus_per_task: int
) -> ResourceShape:
"""Calculate resource requirements for array jobs with optimal GPU packing.

For array jobs, each array element gets its own resource allocation.
This method calculates per-array-element resources based on proportional
allocation of node resources per GPU. For maximum GPUs, returns all
available node resources.

Args:
gpus_per_task (int): Number of GPUs required per array task (1 to max available)

Returns:
ResourceShape: Tuple containing CPU cores per array element (int),
memory per array element (str), and tasks_per_node=1
e.g., ResourceShape(cpu_cores=24, memory="225G", tasks_per_node=1)

Raises:
ValueError: If gpus_per_task is not between 1 and max available GPUs
"""
# Get the total number of GPUs available per node
max_gpus_per_node = self.get_total_gpus_per_node()

if not (1 <= gpus_per_task <= max_gpus_per_node):
raise ValueError(f"gpus_per_task must be between 1 and {max_gpus_per_node}")

# Get total resources per node
total_cpu_cores = self.get_cpus_per_node()
total_ram_mb = self.get_mem_per_node_MB()

if gpus_per_task == max_gpus_per_node:
# For max GPUs, use all available resources
required_cpu_cores = math.floor(total_cpu_cores)
required_ram_mb = math.floor(total_ram_mb)
else:
# Calculate per-GPU allocation based on actual GPU count per node
cpu_cores_per_gpu = total_cpu_cores / max_gpus_per_node
ram_mb_per_gpu = total_ram_mb / max_gpus_per_node

# Calculate requirements per array element
required_cpu_cores = math.floor(cpu_cores_per_gpu * gpus_per_task)
required_ram_mb = math.floor(ram_mb_per_gpu * gpus_per_task)

# Memory: Convert MB to GB and format for Slurm
required_ram_gb = required_ram_mb / 1024
sbatch_memory = f"{required_ram_gb:.0f}G"

return ResourceShape(
slurm_partition=partition,
cpus_per_task=required_cpu_cores,
memory=sbatch_memory,
tasks_per_node=1,
gpus_per_task=gpus_per_task,
)
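
A minimal standalone sketch of the arithmetic the removed get_array_job_requirements performed, using the 192-CPU / 1,843,200 MB / 8-GPU node figures from the tests below; the helper name and hard-coded node figures are illustrative only, not part of clusterscope's API:

import math

def per_element_resources(total_cpus, total_mem_mb, gpus_per_node, gpus_per_task):
    """Proportional share of one node for a single array element (illustrative)."""
    if not (1 <= gpus_per_task <= gpus_per_node):
        raise ValueError(f"gpus_per_task must be between 1 and {gpus_per_node}")
    if gpus_per_task == gpus_per_node:
        cpus, mem_mb = total_cpus, total_mem_mb  # full node: hand over everything
    else:
        cpus = math.floor(total_cpus / gpus_per_node * gpus_per_task)
        mem_mb = math.floor(total_mem_mb / gpus_per_node * gpus_per_task)
    return cpus, f"{mem_mb / 1024:.0f}G"  # Slurm-style memory string

print(per_element_resources(192, 1843200, 8, 1))  # (24, '225G')
print(per_element_resources(192, 1843200, 8, 4))  # (96, '900G')
print(per_element_resources(192, 1843200, 8, 8))  # (192, '1800G')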


class DarwinInfo:
def get_cpu_count(self, timeout: int = 60) -> int:
95 changes: 0 additions & 95 deletions tests/test_cluster_info.py
@@ -902,101 +902,6 @@ def test_getResRequirements_invalid_tasks_per_node(self, mock_total_gpus):
)
self.assertIn("tasks_per_node must be at least 1", str(context.exception))

@patch.object(UnifiedInfo, "get_total_gpus_per_node")
@patch.object(UnifiedInfo, "get_cpus_per_node")
@patch.object(UnifiedInfo, "get_mem_per_node_MB")
def test_getArrayJobRequirements_single_gpu(
self, mock_mem, mock_cpus, mock_total_gpus
):
"""Test getArrayJobRequirements with 1 GPU per task."""
mock_total_gpus.return_value = 8
mock_cpus.return_value = 192
mock_mem.return_value = 1843200

result = self.unified_info.get_array_job_requirements(
partition="test_partition", gpus_per_task=1
)

self.assertEqual(result.cpus_per_task, 24) # 192/8 = 24
self.assertEqual(result.memory, "225G") # 1843200/8/1024 = 225GB
self.assertEqual(result.tasks_per_node, 1) # Always 1 for array jobs

@patch.object(UnifiedInfo, "get_total_gpus_per_node")
@patch.object(UnifiedInfo, "get_cpus_per_node")
@patch.object(UnifiedInfo, "get_mem_per_node_MB")
def test_getArrayJobRequirements_multiple_gpus(
self, mock_mem, mock_cpus, mock_total_gpus
):
"""Test getArrayJobRequirements with multiple GPUs per task."""
mock_total_gpus.return_value = 8
mock_cpus.return_value = 192
mock_mem.return_value = 1843200

result = self.unified_info.get_array_job_requirements(
partition="test_partition", gpus_per_task=4
)

self.assertEqual(result.cpus_per_task, 96) # 192/8*4 = 96
self.assertEqual(result.memory, "900G") # 1843200/8*4/1024 = 900GB
self.assertEqual(result.tasks_per_node, 1)

@patch.object(UnifiedInfo, "get_total_gpus_per_node")
@patch.object(UnifiedInfo, "get_cpus_per_node")
@patch.object(UnifiedInfo, "get_mem_per_node_MB")
def test_getArrayJobRequirements_full_node(
self, mock_mem, mock_cpus, mock_total_gpus
):
"""Test getArrayJobRequirements with all GPUs (full node per task)."""
mock_total_gpus.return_value = 8
mock_cpus.return_value = 192
mock_mem.return_value = 1843200

result = self.unified_info.get_array_job_requirements(
partition="test_partition", gpus_per_task=8
)

self.assertEqual(result.cpus_per_task, 192) # All CPUs
self.assertEqual(result.memory, "1800G") # All memory: 1843200/1024 = 1800GB
self.assertEqual(result.tasks_per_node, 1)

@patch.object(UnifiedInfo, "get_total_gpus_per_node")
@patch.object(UnifiedInfo, "get_cpus_per_node")
@patch.object(UnifiedInfo, "get_mem_per_node_MB")
def test_getArrayJobRequirements_4gpu_node(
self, mock_mem, mock_cpus, mock_total_gpus
):
"""Test getArrayJobRequirements on a 4-GPU node configuration."""
mock_total_gpus.return_value = 4
mock_cpus.return_value = 64
mock_mem.return_value = 524288

result = self.unified_info.get_array_job_requirements(
partition="test_partition", gpus_per_task=2
)

self.assertEqual(result.cpus_per_task, 32) # 64/4*2 = 32
self.assertEqual(result.memory, "256G") # 524288/4*2/1024 = 256GB
self.assertEqual(result.tasks_per_node, 1)

@patch.object(UnifiedInfo, "get_total_gpus_per_node")
def test_getArrayJobRequirements_invalid_gpus_per_task(self, mock_total_gpus):
"""Test getArrayJobRequirements raises ValueError for invalid gpus_per_task."""
mock_total_gpus.return_value = 8

# Test zero GPUs
with self.assertRaises(ValueError) as context:
self.unified_info.get_array_job_requirements(
partition="test_partition", gpus_per_task=0
)
self.assertIn("gpus_per_task must be between 1 and 8", str(context.exception))

# Test more than max GPUs
with self.assertRaises(ValueError) as context:
self.unified_info.get_array_job_requirements(
partition="test_partition", gpus_per_task=9
)
self.assertIn("gpus_per_task must be between 1 and 8", str(context.exception))

@patch.object(UnifiedInfo, "get_gpu_generation_and_count")
def test_get_total_gpus_per_node_with_gpus(self, mock_gpu_info):
"""Test get_total_gpus_per_node with actual GPU detection."""