From 7f19f51e6436fdeeeb5394def56e3dbedd82d9a7 Mon Sep 17 00:00:00 2001
From: Daniel Werner
Date: Sat, 2 Aug 2025 21:11:47 +0200
Subject: [PATCH 1/3] Fix that all slurm jobs were canceled if a single job failed and add regression test

---
 .github/workflows/ci.yml           |  2 +-
 .../schedulers/cluster_executor.py |  6 +--
 cluster_tools/tests/test_slurm.py  | 50 +++++++++++++------
 3 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 137883dda..dca946c8a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -79,7 +79,7 @@ jobs:
       - name: Restart Slurm Cluster
         run: cd ./dockered-slurm && docker compose restart slurmctld c1 c2
 
-      - name: "Run Tests (test_all, test_slurm) with modified slurn.conf"
+      - name: "Run Tests (test_all, test_slurm) with modified slurm.conf"
         run: |
           # Run tests requiring a modified slurm config
           docker exec \

diff --git a/cluster_tools/cluster_tools/schedulers/cluster_executor.py b/cluster_tools/cluster_tools/schedulers/cluster_executor.py
index 75c498656..01a022ef2 100644
--- a/cluster_tools/cluster_tools/schedulers/cluster_executor.py
+++ b/cluster_tools/cluster_tools/schedulers/cluster_executor.py
@@ -630,15 +630,15 @@ def register_jobs(
 
     # Overwrite the context manager __exit as it doesn't forward the information whether an exception was thrown or not otherwise
     # which may lead to a deadlock if an exception is thrown within a cluster executor with statement, because self.jobs_empty_cond.wait()
-    # never succeeds.
+    # never succeeds. However, don't fail hard if a RemoteException occurred as we want the other scheduled jobs to keep running.
     def __exit__(
         self,
         exc_type: type[BaseException] | None,
         _exc_val: BaseException | None,
         _exc_tb: TracebackType | None,
     ) -> Literal[False]:
-        # Don't wait if an exception was thrown
-        self.shutdown(wait=exc_type is None)
+        # Don't wait if an exception other than a RemoteException was thrown
+        self.shutdown(wait=exc_type is None or issubclass(exc_type, RemoteException))
         return False
 
     def shutdown(self, wait: bool = True, cancel_futures: bool = True) -> None:

diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py
index 47162fef9..ffb6198c7 100644
--- a/cluster_tools/tests/test_slurm.py
+++ b/cluster_tools/tests/test_slurm.py
@@ -374,7 +374,11 @@ def test_slurm_time_limit() -> None:
 
 
 @pytest.mark.requires_modified_slurm_config
-def test_slurm_memory_limit() -> None:
+def test_slurm_memory_limit(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("SLURM_MAX_RUNNING_SIZE", "2")
+
     # Request 30 MB
     executor = cluster_tools.get_executor(
         "slurm",
@@ -382,20 +386,38 @@ def test_slurm_memory_limit() -> None:
         job_resources={"mem": "30M"},  # 30M is the smallest limit enforced by Cgroups
     )
 
-    with executor:
-        # Schedule a job that allocates more than 30 MB and let it run for more than 1 second
-        # because the frequency of the memory polling is 1 second
-        duration = 3
-        futures = executor.map_to_futures(
-            partial(allocate, duration), [1024 * 1024 * 50]
-        )
-        concurrent.futures.wait(futures)
-
-        # Job should have been killed with a RemoteOutOfMemoryException
-        assert all(
-            isinstance(fut.exception(), cluster_tools.RemoteOutOfMemoryException)
-            for fut in futures
+    job_ids = []
+    with pytest.raises(
+        (
+            cluster_tools.RemoteOutOfMemoryException,
+            cluster_tools.schedulers.cluster_executor.RemoteException,
         )
+    ):
+        with executor:
+            # Schedule a job that allocates more than 30 MB and let it run for more than 1 second
+            # because the frequency of the memory polling is 1 second
+            duration = 3
+            futures = executor.map_to_futures(
+                partial(allocate, duration), [1024 * 1024 * 50, 10, 20, 30]
+            )
+            for future in futures:
+                future.add_done_callback(
+                    lambda f: job_ids.append(f"{f.cluster_jobid}_{f.cluster_jobindex}")
+                )
+            # Wait for jobs to finish
+            [fut.result() for fut in futures]
+
+    # Check that all jobs but one ran successfully
+    job_states = []
+    for job_id in job_ids:
+        # Show the overall job state (-X), without a header (-n) and without extra whitespace (-P)
+        stdout, _, exit_code = call(f"sacct -j {job_id} -o State -P -n -X")
+        assert exit_code == 0
+        job_states.append(stdout.strip())
+
+    # Although one job failed, the other jobs should have continued running and succeeded
+    assert job_states.count("FAILED") == 1
+    assert job_states.count("COMPLETED") == 3
 
 
 def test_slurm_max_array_size_env() -> None:

From 57e0b54d561da87fd52fc78c486f12d4d2fb66ad Mon Sep 17 00:00:00 2001
From: Daniel Werner
Date: Sat, 2 Aug 2025 21:12:48 +0200
Subject: [PATCH 2/3] Fix typing

---
 cluster_tools/tests/test_slurm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py
index ffb6198c7..f18f07ae0 100644
--- a/cluster_tools/tests/test_slurm.py
+++ b/cluster_tools/tests/test_slurm.py
@@ -402,7 +402,7 @@ def test_slurm_memory_limit(
             )
             for future in futures:
                 future.add_done_callback(
-                    lambda f: job_ids.append(f"{f.cluster_jobid}_{f.cluster_jobindex}")
+                    lambda f: job_ids.append(f"{f.cluster_jobid}_{f.cluster_jobindex}")  # type: ignore[attr-defined]
                 )
             # Wait for jobs to finish
             [fut.result() for fut in futures]

From d1732fa528fb0c4f5af337c48988f24954e0aa7d Mon Sep 17 00:00:00 2001
From: Daniel Werner
Date: Sat, 2 Aug 2025 21:24:43 +0200
Subject: [PATCH 3/3] Update changelog

---
 cluster_tools/Changelog.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster_tools/Changelog.md b/cluster_tools/Changelog.md
index 4c10d6daa..886be821f 100644
--- a/cluster_tools/Changelog.md
+++ b/cluster_tools/Changelog.md
@@ -16,6 +16,7 @@ For upgrade instructions, please check the respective *Breaking Changes* section
 
 ### Changed
 
 ### Fixed
+- Fixed that all slurm jobs were canceled if a single job failed. [#1347](https://github.com/scalableminds/webknossos-libs/pull/1347)
 
 ## [2.4.7](https://github.com/scalableminds/webknossos-libs/releases/tag/v2.4.7) - 2025-07-30
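
Note (not part of the patch series): the effect of the __exit__ change is easiest to see from the caller's side. The sketch below is illustrative only and assumes a working Slurm setup; the workload function double is made up, while get_executor, map_to_futures, and RemoteException are the same APIs already used in the diff above. Previously, any exception escaping the with block led to shutdown(wait=False) and all sibling jobs were canceled; with this fix, a RemoteException leads to shutdown(wait=True), so the surviving jobs run to completion.

    import cluster_tools
    from cluster_tools.schedulers.cluster_executor import RemoteException

    def double(x: int) -> int:  # hypothetical workload, for illustration only
        return 2 * x

    executor = cluster_tools.get_executor(
        "slurm", debug=True, job_resources={"mem": "30M"}  # resources as in the regression test
    )
    try:
        with executor:
            futures = executor.map_to_futures(double, [1, 2, 3, 4])
            # If one remote job dies (e.g. because of the memory limit), result()
            # re-raises it here and the exception leaves the with block.
            results = [fut.result() for fut in futures]
    except RemoteException:
        # __exit__ now calls shutdown(wait=True) for RemoteExceptions, so the
        # remaining jobs keep running and finish instead of being canceled.
        pass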