Skip to content

Commit ef16909

Browse files
authored
Fix multi gpu trainer (PrimeIntellect-ai#492)
1 parent ef54176 commit ef16909

File tree

1 file changed

+2
-0
lines changed

1 file changed

+2
-0
lines changed

src/zeroband/training/train.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ def train(config: TrainingConfig):
57 57
# Optionally, sidecar the orchestrator
58 58
orchestrator = None
59 59
if config.orchestrator and world.rank == 0:
60+
config.orchestrator.num_train_workers = world.world_size
60 61
logger.info("Starting orchestrator in a separate process")
61 62

62 63
# Create a queue for orchestrator to signal when setup is complete
@@ -89,6 +90,7 @@ def train(config: TrainingConfig):
89 90
torch._dynamo.config.suppress_errors = True
90 91

91 92
torch.set_float32_matmul_precision("high")
93+
torch.cuda.set_device(world.rank)
92 94

93 95
if config.weights.path and world.rank == 0:
94 96
if envs.SHARDCAST_OUTPUT_DIR is not None:

0 commit comments

Comments (0)