From 7f19f51e6436fdeeeb5394def56e3dbedd82d9a7 Mon Sep 17 00:00:00 2001
From: Daniel Werner
Date: Sat, 2 Aug 2025 21:11:47 +0200
Subject: [PATCH 1/3] Fix that all slurm jobs were canceled if a single job failed and add regression test

---
 .github/workflows/ci.yml           |  2 +-
 .../schedulers/cluster_executor.py |  6 +--
 cluster_tools/tests/test_slurm.py  | 50 +++++++++++++------
 3 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 137883dda..dca946c8a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -79,7 +79,7 @@ jobs:
       - name: Restart Slurm Cluster
         run: cd ./dockered-slurm && docker compose restart slurmctld c1 c2
 
-      - name: "Run Tests (test_all, test_slurm) with modified slurn.conf"
+      - name: "Run Tests (test_all, test_slurm) with modified slurm.conf"
         run: |
           # Run tests requiring a modified slurm config
           docker exec \

diff --git a/cluster_tools/cluster_tools/schedulers/cluster_executor.py b/cluster_tools/cluster_tools/schedulers/cluster_executor.py
index 75c498656..01a022ef2 100644
--- a/cluster_tools/cluster_tools/schedulers/cluster_executor.py
+++ b/cluster_tools/cluster_tools/schedulers/cluster_executor.py
@@ -630,15 +630,15 @@ def register_jobs(
 
     # Overwrite the context manager __exit as it doesn't forward the information whether an exception was thrown or not otherwise
     # which may lead to a deadlock if an exception is thrown within a cluster executor with statement, because self.jobs_empty_cond.wait()
-    # never succeeds.
+    # never succeeds. However, don't fail hard if a RemoteException occurred as we want the other scheduled jobs to keep running.
     def __exit__(
         self,
         exc_type: type[BaseException] | None,
         _exc_val: BaseException | None,
         _exc_tb: TracebackType | None,
     ) -> Literal[False]:
-        # Don't wait if an exception was thrown
-        self.shutdown(wait=exc_type is None)
+        # Don't wait if an exception other than a RemoteException was thrown
+        self.shutdown(wait=exc_type is None or issubclass(exc_type, RemoteException))
         return False
 
     def shutdown(self, wait: bool = True, cancel_futures: bool = True) -> None:

diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py
index 47162fef9..ffb6198c7 100644
--- a/cluster_tools/tests/test_slurm.py
+++ b/cluster_tools/tests/test_slurm.py
@@ -374,7 +374,11 @@ def test_slurm_time_limit() -> None:
 
 
 @pytest.mark.requires_modified_slurm_config
-def test_slurm_memory_limit() -> None:
+def test_slurm_memory_limit(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    monkeypatch.setenv("SLURM_MAX_RUNNING_SIZE", "2")
+
     # Request 30 MB
     executor = cluster_tools.get_executor(
         "slurm",
@@ -382,20 +386,38 @@ def test_slurm_memory_limit() -> None:
         job_resources={"mem": "30M"},  # 30M is the smallest limit enforced by Cgroups
     )
 
-    with executor:
-        # Schedule a job that allocates more than 30 MB and let it run for more than 1 second
-        # because the frequency of the memory polling is 1 second
-        duration = 3
-        futures = executor.map_to_futures(
-            partial(allocate, duration), [1024 * 1024 * 50]
-        )
-        concurrent.futures.wait(futures)
-
-        # Job should have been killed with a RemoteOutOfMemoryException
-        assert all(
-            isinstance(fut.exception(), cluster_tools.RemoteOutOfMemoryException)
-            for fut in futures
+    job_ids = []
+    with pytest.raises(
+        (
+            cluster_tools.RemoteOutOfMemoryException,
+            cluster_tools.schedulers.cluster_executor.RemoteException,
         )
+    ):
+        with executor:
+            # Schedule a job that allocates more than 30 MB and let it run for more than 1 second
+            # because the frequency of the memory polling is 1 second
+            duration = 3
+            futures = executor.map_to_futures(
+                partial(allocate, duration), [1024 * 1024 * 50, 10, 20, 30]
+            )
+            for future in futures:
+                future.add_done_callback(
+                    lambda f: job_ids.append(f"{f.cluster_jobid}_{f.cluster_jobindex}")
+                )
+            # Wait for jobs to finish
+            [fut.result() for fut in futures]
+
+    # Check that all jobs but one ran successfully
+    job_states = []
+    for job_id in job_ids:
+        # Show the overall job state (-X), without a header (-n) and without extra whitespace (-P)
+        stdout, _, exit_code = call(f"sacct -j {job_id} -o State -P -n -X")
+        assert exit_code == 0
+        job_states.append(stdout.strip())
+
+    # Although one job failed, the other jobs should have continued running and succeeded
+    assert job_states.count("FAILED") == 1
+    assert job_states.count("COMPLETED") == 3
 
 
 def test_slurm_max_array_size_env() -> None:

From 57e0b54d561da87fd52fc78c486f12d4d2fb66ad Mon Sep 17 00:00:00 2001
From: Daniel Werner
Date: Sat, 2 Aug 2025 21:12:48 +0200
Subject: [PATCH 2/3] Fix typing

---
 cluster_tools/tests/test_slurm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cluster_tools/tests/test_slurm.py b/cluster_tools/tests/test_slurm.py
index ffb6198c7..f18f07ae0 100644
--- a/cluster_tools/tests/test_slurm.py
+++ b/cluster_tools/tests/test_slurm.py
@@ -402,7 +402,7 @@ def test_slurm_memory_limit(
             )
             for future in futures:
                 future.add_done_callback(
-                    lambda f: job_ids.append(f"{f.cluster_jobid}_{f.cluster_jobindex}")
+                    lambda f: job_ids.append(f"{f.cluster_jobid}_{f.cluster_jobindex}")  # type: ignore[attr-defined]
                 )
             # Wait for jobs to finish
             [fut.result() for fut in futures]

From d1732fa528fb0c4f5af337c48988f24954e0aa7d Mon Sep 17 00:00:00 2001
From: Daniel Werner
Date: Sat, 2 Aug 2025 21:24:43 +0200
Subject: [PATCH 3/3] Update changelog

---
 cluster_tools/Changelog.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cluster_tools/Changelog.md b/cluster_tools/Changelog.md
index 4c10d6daa..886be821f 100644
--- a/cluster_tools/Changelog.md
+++ b/cluster_tools/Changelog.md
@@ -16,6 +16,7 @@ For upgrade instructions, please check the respective *Breaking Changes* section
 
 ### Changed
 
 ### Fixed
+- Fixed that all slurm jobs were canceled if a single job failed. [#1347](https://github.com/scalableminds/webknossos-libs/pull/1347)
 
 ## [2.4.7](https://github.com/scalableminds/webknossos-libs/releases/tag/v2.4.7) - 2025-07-30
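
Note (not part of the patch series): the effect of the __exit__ change is easiest to see from the caller's side. The sketch below is illustrative only and assumes a working Slurm setup; the workload function double is made up, while get_executor, map_to_futures, and RemoteException are the same APIs already used in the diff above. Previously, any exception escaping the with block led to shutdown(wait=False) and all sibling jobs were canceled; with this fix, a RemoteException leads to shutdown(wait=True), so the surviving jobs run to completion.

    import cluster_tools
    from cluster_tools.schedulers.cluster_executor import RemoteException

    def double(x: int) -> int:  # hypothetical workload, for illustration only
        return 2 * x

    executor = cluster_tools.get_executor(
        "slurm", debug=True, job_resources={"mem": "30M"}  # resources as in the regression test
    )
    try:
        with executor:
            futures = executor.map_to_futures(double, [1, 2, 3, 4])
            # If one remote job dies (e.g. because of the memory limit), result()
            # re-raises it here and the exception leaves the with block.
            results = [fut.result() for fut in futures]
    except RemoteException:
        # __exit__ now calls shutdown(wait=True) for RemoteExceptions, so the
        # remaining jobs keep running and finish instead of being canceled.
        pass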