diff --git a/clusterscope/cli.py b/clusterscope/cli.py index c9120d8..c2a2683 100644 --- a/clusterscope/cli.py +++ b/clusterscope/cli.py @@ -243,42 +243,6 @@ def slurm( click.echo(format_methods[output_format]()) -@job_gen.command() -@click.option( - "--gpus-per-task", type=int, required=True, help="Number of GPUs per task" -) -@click.option( - "--format", - "output_format", - type=click.Choice(["json", "sbatch", "srun", "submitit", "salloc"]), - default="json", - help="Format to output the job requirements in", -) -@click.option( - "--partition", - type=str, - default=None, - help="Slurm partition name to filter queries (optional)", -) -def array(gpus_per_task: int, output_format: str, partition: str): - """Generate job requirements for an array job.""" - unified_info = UnifiedInfo(partition=partition) - job_requirements = unified_info.get_array_job_requirements( - partition=partition, - gpus_per_task=gpus_per_task, - ) - - # Route to the correct format method based on CLI option - format_methods = { - "json": job_requirements.to_json, - "sbatch": job_requirements.to_sbatch, - "srun": job_requirements.to_srun, - "salloc": job_requirements.to_salloc, - "submitit": job_requirements.to_submitit, - } - click.echo(format_methods[output_format]()) - - def main(): """Main entry point for the Slurm information CLI.""" cli() diff --git a/clusterscope/cluster_info.py b/clusterscope/cluster_info.py index e16c6e3..7400f71 100644 --- a/clusterscope/cluster_info.py +++ b/clusterscope/cluster_info.py @@ -335,62 +335,6 @@ def get_task_resource_requirements( tasks_per_node=tasks_per_node, ) - def get_array_job_requirements( - self, partition: str, gpus_per_task: int - ) -> ResourceShape: - """Calculate resource requirements for array jobs with optimal GPU packing. - - For array jobs, each array element gets its own resource allocation. - This method calculates per-array-element resources based on proportional - allocation of node resources per GPU. For maximum GPUs, returns all - available node resources. - - Args: - gpus_per_task (int): Number of GPUs required per array task (1 to max available) - - Returns: - ResourceShape: Tuple containing CPU cores per array element (int), - memory per array element (str), and tasks_per_node=1 - e.g., ResourceShape(cpu_cores=24, memory="225G", tasks_per_node=1) - - Raises: - ValueError: If gpus_per_task is not between 1 and max available GPUs - """ - # Get the total number of GPUs available per node - max_gpus_per_node = self.get_total_gpus_per_node() - - if not (1 <= gpus_per_task <= max_gpus_per_node): - raise ValueError(f"gpus_per_task must be between 1 and {max_gpus_per_node}") - - # Get total resources per node - total_cpu_cores = self.get_cpus_per_node() - total_ram_mb = self.get_mem_per_node_MB() - - if gpus_per_task == max_gpus_per_node: - # For max GPUs, use all available resources - required_cpu_cores = math.floor(total_cpu_cores) - required_ram_mb = math.floor(total_ram_mb) - else: - # Calculate per-GPU allocation based on actual GPU count per node - cpu_cores_per_gpu = total_cpu_cores / max_gpus_per_node - ram_mb_per_gpu = total_ram_mb / max_gpus_per_node - - # Calculate requirements per array element - required_cpu_cores = math.floor(cpu_cores_per_gpu * gpus_per_task) - required_ram_mb = math.floor(ram_mb_per_gpu * gpus_per_task) - - # Memory: Convert MB to GB and format for Slurm - required_ram_gb = required_ram_mb / 1024 - sbatch_memory = f"{required_ram_gb:.0f}G" - - return ResourceShape( - slurm_partition=partition, - cpus_per_task=required_cpu_cores, - memory=sbatch_memory, - tasks_per_node=1, - gpus_per_task=gpus_per_task, - ) - class DarwinInfo: def get_cpu_count(self, timeout: int = 60) -> int: diff --git a/tests/test_cluster_info.py b/tests/test_cluster_info.py index 1cbb81f..b88a809 100644 --- a/tests/test_cluster_info.py +++ b/tests/test_cluster_info.py @@ -902,101 +902,6 @@ def test_getResRequirements_invalid_tasks_per_node(self, mock_total_gpus): ) self.assertIn("tasks_per_node must be at least 1", str(context.exception)) - @patch.object(UnifiedInfo, "get_total_gpus_per_node") - @patch.object(UnifiedInfo, "get_cpus_per_node") - @patch.object(UnifiedInfo, "get_mem_per_node_MB") - def test_getArrayJobRequirements_single_gpu( - self, mock_mem, mock_cpus, mock_total_gpus - ): - """Test getArrayJobRequirements with 1 GPU per task.""" - mock_total_gpus.return_value = 8 - mock_cpus.return_value = 192 - mock_mem.return_value = 1843200 - - result = self.unified_info.get_array_job_requirements( - partition="test_partition", gpus_per_task=1 - ) - - self.assertEqual(result.cpus_per_task, 24) # 192/8 = 24 - self.assertEqual(result.memory, "225G") # 1843200/8/1024 = 225GB - self.assertEqual(result.tasks_per_node, 1) # Always 1 for array jobs - - @patch.object(UnifiedInfo, "get_total_gpus_per_node") - @patch.object(UnifiedInfo, "get_cpus_per_node") - @patch.object(UnifiedInfo, "get_mem_per_node_MB") - def test_getArrayJobRequirements_multiple_gpus( - self, mock_mem, mock_cpus, mock_total_gpus - ): - """Test getArrayJobRequirements with multiple GPUs per task.""" - mock_total_gpus.return_value = 8 - mock_cpus.return_value = 192 - mock_mem.return_value = 1843200 - - result = self.unified_info.get_array_job_requirements( - partition="test_partition", gpus_per_task=4 - ) - - self.assertEqual(result.cpus_per_task, 96) # 192/8*4 = 96 - self.assertEqual(result.memory, "900G") # 1843200/8*4/1024 = 900GB - self.assertEqual(result.tasks_per_node, 1) - - @patch.object(UnifiedInfo, "get_total_gpus_per_node") - @patch.object(UnifiedInfo, "get_cpus_per_node") - @patch.object(UnifiedInfo, "get_mem_per_node_MB") - def test_getArrayJobRequirements_full_node( - self, mock_mem, mock_cpus, mock_total_gpus - ): - """Test getArrayJobRequirements with all GPUs (full node per task).""" - mock_total_gpus.return_value = 8 - mock_cpus.return_value = 192 - mock_mem.return_value = 1843200 - - result = self.unified_info.get_array_job_requirements( - partition="test_partition", gpus_per_task=8 - ) - - self.assertEqual(result.cpus_per_task, 192) # All CPUs - self.assertEqual(result.memory, "1800G") # All memory: 1843200/1024 = 1800GB - self.assertEqual(result.tasks_per_node, 1) - - @patch.object(UnifiedInfo, "get_total_gpus_per_node") - @patch.object(UnifiedInfo, "get_cpus_per_node") - @patch.object(UnifiedInfo, "get_mem_per_node_MB") - def test_getArrayJobRequirements_4gpu_node( - self, mock_mem, mock_cpus, mock_total_gpus - ): - """Test getArrayJobRequirements on a 4-GPU node configuration.""" - mock_total_gpus.return_value = 4 - mock_cpus.return_value = 64 - mock_mem.return_value = 524288 - - result = self.unified_info.get_array_job_requirements( - partition="test_partition", gpus_per_task=2 - ) - - self.assertEqual(result.cpus_per_task, 32) # 64/4*2 = 32 - self.assertEqual(result.memory, "256G") # 524288/4*2/1024 = 256GB - self.assertEqual(result.tasks_per_node, 1) - - @patch.object(UnifiedInfo, "get_total_gpus_per_node") - def test_getArrayJobRequirements_invalid_gpus_per_task(self, mock_total_gpus): - """Test getArrayJobRequirements raises ValueError for invalid gpus_per_task.""" - mock_total_gpus.return_value = 8 - - # Test zero GPUs - with self.assertRaises(ValueError) as context: - self.unified_info.get_array_job_requirements( - partition="test_partition", gpus_per_task=0 - ) - self.assertIn("gpus_per_task must be between 1 and 8", str(context.exception)) - - # Test more than max GPUs - with self.assertRaises(ValueError) as context: - self.unified_info.get_array_job_requirements( - partition="test_partition", gpus_per_task=9 - ) - self.assertIn("gpus_per_task must be between 1 and 8", str(context.exception)) - @patch.object(UnifiedInfo, "get_gpu_generation_and_count") def test_get_total_gpus_per_node_with_gpus(self, mock_gpu_info): """Test get_total_gpus_per_node with actual GPU detection."""