We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent ef54176 commit ef16909 (Copy full SHA for ef16909)
src/zeroband/training/train.py
@@ -57,6 +57,7 @@ def train(config: TrainingConfig):
57
# Optionally, sidecar the orchestrator
58
orchestrator = None
59
if config.orchestrator and world.rank == 0:
60
+ config.orchestrator.num_train_workers = world.world_size
61
logger.info("Starting orchestrator in a separate process")
62
63
# Create a queue for orchestrator to signal when setup is complete
@@ -89,6 +90,7 @@ def train(config: TrainingConfig):
89
90
torch._dynamo.config.suppress_errors = True
91
92
torch.set_float32_matmul_precision("high")
93
+ torch.cuda.set_device(world.rank)
94
95
if config.weights.path and world.rank == 0:
96
if envs.SHARDCAST_OUTPUT_DIR is not None:
0 commit comments